def test_tag(self):
    # Assert [("black", "JJ"), ("cats", "NNS")].
    v = en.tag("black cats")
    self.assertEqual(v, [("black", "JJ"), ("cats", "NNS")])
    v = en.tag("")
    self.assertEqual(v, [])
    print "pattern.en.tag()"
def get(self, description, index = False, field = False, debug = False, fields = False):
    """Automatically get authorities (all proper nouns) from descriptions, link them to said descriptions and return an index of links and authorities

    Keyword arguments:
    description --- Either a description from an EHRI.get() descriptions list or an EHRI.get() descriptions list
    index --- Index of items, if it already exists
    field --- Field to query, default is scopeAndContent
    debug --- Debug mode: print details during execution
    fields --- If more than one field
    """
    if index:
        self.index = index
    if field:
        self.field = field
    if debug:
        self.debug = debug
    # If description is a list of descriptions, then we run a loop on it
    if isinstance(description, list):
        for element in description:
            self.get(element, fields = fields)
    else:
        if self.debug:
            print "Handling Item Id " + description[self.identifier]
        try:
            if fields:
                tokens = tag(". ".join([description[item] for item in description if item in fields]))
            else:
                tokens = tag(description[self.field])
        except:
            print "Tokenization failed for " + self.field
            sys.exit()
        i = 0
        entities = []
        while i < len(tokens):
            # Setting up temp variables
            name, pos = tokens[i]
            z = 1
            if pos == "NNP":
                entity_name = name
                #if tokens[i+z]:
                if i + z + 1 < len(tokens):
                    while tokens[i+z][1] == "NNP" or (z + i + 1 < len(tokens) and tokens[i+z][0].lower() == "of" and tokens[i+z+1][1] == "NNP"):
                        entity_name += " " + tokens[i+z][0]
                        z += 1
                        # Breaking it if not anymore in index range
                        if z + i == len(tokens):
                            break
                self.index["authorities"].append(entity_name)
                if description["idDoc"] not in self.index["items"]:
                    self.index["items"][description["idDoc"]] = []
                self.index["items"][description["idDoc"]].append(entity_name)
            i += z
    return self.index
def transform(self, text, less, more):
    """transforms a body of text to have less of less and more of more!

    :param text: text to transform
    :type text: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list
    :returns: transformed text
    :rtype: str
    """
    last_was_article = False
    new_text = []
    less = [l for l in less if not self._ignore(l, en.tag(l)[0][1])]
    more = [m for m in more if not self._ignore(m, en.tag(m)[0][1])]
    # iterate over words
    for word, pos in en.tag(text):
        if word not in self.model or self._ignore(word, pos):
            if self._is_punc(pos):
                new_text.append(u'\b' + word)
            else:
                new_text.append(word)
        else:
            new_word = self._transform_word(word, pos, less, more)
            # handle 'a' v. 'an'
            if new_text and new_text[-1] in ['a', 'an']:
                new_text[-1] = 'an' if new_word[0] in 'aeiou' else 'a'
            new_text.append(new_word)
    ret = ''
    # remove backspaces - this is dumb
    for t in new_text:
        if t.startswith('\b'):
            ret += t[1:]
        else:
            ret += ((' ' + t) if ret != '' else t)
    return ret
def create_description(self):
    pat = 'VB|VBD|VBZ|VBG * NN IN * NN'
    #pat = 'PRP * VB|VBD|VBZ|VBG * NN'
    phrases = search.search_out(self.source_text, pat)
    conjugated_phrases = []
    for phrase in phrases:
        words = []
        for word, pos in tag(phrase):
            if pos in ["VBZ", "VBD", "VB", "VBG"]:
                words.append(conjugate(word, "3sg"))
            #elif pos == "NN" and random.random() < .1:
            #    words.append(self.define_word(word))
            else:
                words.append(word)
        conjugated_phrases.append(' '.join(words))
    artifacts = list(self.artifacts)
    sentence_prefixes = ["The present invention", "The device", "The invention"]
    paragraph_prefixes = ["The present invention",
                          "According to a beneficial embodiment, the invention",
                          "According to another embodiment, the device",
                          "According to a preferred embodiment, the invention",
                          "In accordance with an alternative specific embodiment, the present invention"]
    i = 0
    self.description = ''
    for phrase in conjugated_phrases:
        line = ""
        if i == 0:
            line = paragraph_prefixes[0] + " " + phrase
        else:
            if random.random() < .1:
                line = "\n\n" + random.choice(paragraph_prefixes) + " " + phrase
            else:
                line = random.choice(sentence_prefixes) + " " + phrase
        self.description += line + ". "
        i += 1
def posify(self, bottish, english, dictionary):
    # TODO incomplete - add more tags!
    tags = {
        ("NN", "NNS"): "noun",
        ("NNP", "NNPS"): "prop. noun",
        "CC": "conj.",
        "DT": "det.",
        "JJ": "adj.",
        ("PRP", "PRP$"): "pers. pron",
        ("RB", "RBS"): "adv.",
        "UH": "interj.",
        ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): "verb"
    }
    pos = ""
    for key in tags.keys():
        if tag(english)[0][1] in key:
            pos = tags[key]
    if pos == "noun":
        bottish = self.pluralify(bottish, english, dictionary)
    elif pos == "adj.":
        bottish = self.adjectify(bottish, english, dictionary)
    #elif pos == "verb":
    #    bottish = self.conjugify(bottish, english, dictionary)
    return [bottish, pos]
def getEntities(parser, tweet, xEntities):
    try:
        spacyParsedObject = parser(tweet)
        sentence = TextBlob(tweet)
        textblobTaggedObject = sentence.parse().split()
        patterntaggedObject = tag(tweet, tokenize=True)
        for word in patterntaggedObject:
            word, wordtag = word
            if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
                v = str(word)
                v = v.strip()
                if v not in xEntities:
                    xEntities[v] = str(wordtag)
        for taggedObject in textblobTaggedObject:
            for word in taggedObject:
                word, wordtag = word[0], word[1]
                if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
                    v = str(word)
                    v = v.strip()
                    if v not in xEntities:
                        xEntities[v] = str(wordtag)
        for word in spacyParsedObject:
            if word.tag_ == "NNP" or word.tag_ == "NN" or word.tag_ == "PRP":
                v = str(word)
                v = v.strip()
                if v not in xEntities:
                    xEntities[v] = str(word.tag_)
        return xEntities
    except Exception as e:
        return e
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
     'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
     'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def getImage(refstring):
    tagged = tag(refstring)
    nouns = [word for word, pos in tagged if pos == 'NNP' or pos == 'NP' or pos == 'NN']
    try:
        query = random.choice(nouns)
    except IndexError:
        # somehow this string has no nouns!
        if DEBUG:
            print("Paragraph with no nouns:\n" + refstring, file=sys.stderr)
        return None
    if DEBUG:
        print(query, file=sys.stderr)
    flickr = flickrapi.FlickrAPI(flickr_key, flickr_secret, format='parsed-json')
    result = flickr.photos_search(api_key=flickr_key, text=query, privacy_filter=1,
                                  safe_search=1, sort='interestingness-desc', orientation="landscape")
    try:
        pick = random.choice(result['photos']['photo'])
        url = 'https://farm' + str(pick['farm']) + '.staticflickr.com/' + str(pick['server']) + '/' + str(pick['id']) + '_' + str(pick['secret']) + '_z.jpg'
    except IndexError:
        # there were no results, so the random.choice call failed above. This is OK, we'll just move on.
        url = None
    image = {}
    image['url'] = url
    image['noun'] = query
    return image
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st + word + " "
            else:
                if st != "":
                    st = st[0:-1] + " "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model', 'according', 'data', u'models', 'function', 'properties', 'approach',
                            'parameters', 'systems', 'number', 'order', u'data', 'analysis', u'information',
                            u'journal', 'results', 'using', 'research', 'consumers', 'scientists', 'model',
                            'models', 'journal', 'researchers', 'paper', 'new', 'study', 'time', 'case',
                            'simulation', u'simulation', 'equation', 'based', 'years', 'better', 'theory',
                            'particular', 'many', 'due', 'much', 'set', 'studies', 'systems', 'simple',
                            'example', 'work', 'non', 'experiments', 'large', 'small', 'experiment',
                            u'experiments', 'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    # now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english + list(sciencestopwords)
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
    # X is a sparse matrix of docs x vocab size (7638),
    # so X[doc_num] is the sparse vector of its words and ||X[doc_num]|| = 1.
    # There are 7638 unique words and 755 docs, with a total number of 38888 non-zeros.
    # Xinv[doc_num] is the list of words in the doc.
    return nounlist, vectorizer, X, Xinv
def _transform_word(self, word, pos, less, more):
    """transforms a word to be less less and more more

    :param word: word to transform
    :type word: str
    :param pos: part of speech of the word
    :type pos: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list
    :returns: transformed word
    :rtype: str
    """
    new_word = self._get_similar_word(word, less, more)
    new_pos = en.tag(new_word)[0][1]
    if (pos[:2] != new_pos[:2]) or word == new_word:
        return word
    # handle noun
    if pos.startswith('NN'):
        # pluralization
        if pos.endswith('S') and not new_pos.endswith('S'):
            new_word = en.pluralize(new_word)
        elif not pos.endswith('S') and new_pos.endswith('S'):
            new_word = en.singularize(new_word)
        # capitalization
        if word[0].isupper():
            new_word = new_word[0].upper() + new_word[1:]
        else:
            new_word = new_word.lower()
    # handle verb
    elif pos.startswith('VB'):
        tense, person, number = en.tenses(word)[0][:3]
        # conjugation
        conjugated = en.conjugate(new_word, tense=tense, person=person, number=number, parse=False)
        if conjugated is not None:
            new_word = conjugated
    # remove underscores for joint words
    new_word = new_word.replace('_', ' ')
    return new_word
def parse_pos(source_filename, output_filename, pos):
    wordlist = list(open(source_filename).read().split())
    matched_words = [word for word in wordlist if tag(word)[0][1] == pos]
    fp = open(output_filename, "w")
    for w in matched_words:
        fp.write("%s\n" % w)
    fp.close()
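# A minimal usage sketch for parse_pos() above, assuming `from pattern.en import tag` is in scope.
# Both filenames are hypothetical placeholders: the first must exist and contain whitespace-separated words.
parse_pos("wordlist.txt", "plural_nouns.txt", "NNS")  # writes one word per line whose single-word tag is NNS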
def denver_lyrics(link):
    link = link.replace('..', 'http://www.azlyrics.com/')
    html = urlopen(link).read()
    soup = BeautifulSoup(html)
    lyrics = str(''.join(''.join([s.text for s in soup.findAll('div')]).split('lyrics')[1:]).split('\n\n\n\n\r\nif')[0].strip()).replace('LYRICS', "").replace('JOHN DENVER', '')
    lyrics = [x for x in lyrics.splitlines() if x]
    lyrics = ' '.join([str(x) for x in lyrics if x[0] not in ['[', ' ']])
    exclude = set(string.punctuation)
    lyrics = ''.join(ch.lower() for ch in lyrics if ch not in exclude)
    lyrics = lyrics.split()
    nounlist = []
    for word in lyrics:
        if tag(word)[0][1] in ['NN', 'NNP'] and len(word) > 2 and tag(word) not in nounlist:
            nounlist.append(tag(word))
    return nounlist

#http://www.azlyrics.com/n/neildiamond.html
#http://www.azlyrics.com/j/johndenver.html
def tokenizer(self, url):
    #page = URL(url).download(user_agent='Mozilla/5')
    page = URL(url).download()
    text = plaintext(page, keep={})
    tokens = tag(text)
    #print tokens
    print len(tokens), ' words'
    document = []
    while tokens:
        document.append(tokens.pop(0)[0])
def process(wrd):
    tmp = ''
    ignore_pos = ['IN', 'RP', 'TO']
    exception_lemma = ['flatter', 'flattered']
    if tag(wrd)[0][1] in ignore_pos:
        tmp = wrd
    elif any(wrd in ex_l for ex_l in exception_lemma):
        tmp = wrd
    else:
        tmp = conjugate(wrd, tense=PAST)
    return tmp
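# Illustrative calls only, assuming `from pattern.en import tag, conjugate, PAST` as process() expects.
# Exact tags for isolated words depend on pattern's tagger, so treat the expected values as approximate.
print(process("walk"))  # expected "walked": not ignored, not an exception, so conjugated to past tense
print(process("to"))    # expected "to": its tag "TO" is in ignore_pos, so the word passes through unchanged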
def run(o):
    # https://github.com/clips/pattern/blob/master/examples/03-en/03-parse.py
    import os, sys  # sys.path.insert(0, os.path.join("..", ".."))
    from pattern.en import parse, pprint, tag

    # The en module contains a fast regular expressions-based parser.
    # A parser identifies words in a sentence, word part-of-speech tags (e.g. noun, verb)
    # and groups of words that belong together (e.g. noun phrases).
    # Common part-of-speech tags: NN (noun), VB (verb), JJ (adjective), PP (preposition).
    # A tag can have a suffix, for example NNS (plural noun) or VBG (gerund verb).
    # Overview of tags: http://www.clips.ua.ac.be/pages/mbsp-tags
    s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
    s = parse(s,
        tokenize = True,   # Tokenize the input, i.e. split punctuation from words.
        tags = True,       # Find part-of-speech tags.
        chunks = True,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
        lemmata = True,    # Find word lemmata.
        light = False)

    # The light parameter determines how unknown words are handled.
    # By default, unknown words are tagged NN and then improved with a set of rules.
    # light=False uses Brill's lexical and contextual rules,
    # light=True uses a set of custom rules that is less accurate but faster (5x-10x).

    # The output is a string with each sentence on a new line.
    # Words in a sentence have been annotated with tags,
    # for example: fork/NN/I-NP/I-PNP
    # NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
    print s
    print

    # Prettier output can be obtained with the pprint() command:
    pprint(s)
    print

    # The string's split() method will (unless a split character is given)
    # split into a list of sentences, where each sentence is a list of words
    # and each word is a list with the word + its tags.
    print s.split()
    print

    # The tag() command returns a list of (word, POS-tag)-tuples.
    # With light=True, this is the fastest and simplest way to get an idea
    # of a sentence's constituents:
    s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
    s = tag(s)
    print s
    for word, tag in s:
        if tag == "NN":  # Find all nouns in the input string.
            print word
def extract_pos_feat(text):
    pos_feat = []
    for article in text:
        pos_feat.append('~~~\n')
        for sent in article:
            token_tags = tag(sent)
            tags = map(lambda x: x[1], token_tags)
            pos_sent = START + ' '.join(tags) + END
            pos_feat.append(pos_sent)
    return pos_feat
def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
def clean_text_by_word(text, language="english", deacc=False):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    init_textcleanner(language)
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc))
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def pluralify(self, bottish, english, dictionary):
    plural_suffix = "ly"
    for definition in dictionary:
        # if English word is a plural
        if tag(english)[0][1] in ("NNS", "NNPS"):
            # if the singular English word is already defined...
            if singularize(english) == definition["english"]:
                bottish = definition["bottish"] + plural_suffix
                return bottish
            # otherwise generate a new plural
            else:
                bottish = bottish + plural_suffix
                return bottish
        # if English word is a singular...
        elif tag(english)[0][1] in ("NN", "NNP"):
            # if a plural version is already defined...
            if pluralize(english) == definition["english"]:
                bottish = definition["bottish"].rstrip(plural_suffix)
                return bottish
    return bottish
def tagging(self, sentence):
    filename = "nouns.txt"
    filename1 = "adjectives.txt"
    tags = tag(sentence)
    # POS tagging to get the required words corresponding to the below noun and adjective tags
    tags_n = [word for word, pos in tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
    tags_adj = [word for word, pos in tags if pos in ['JJ', 'JJS', 'JJR']]
    # Dumping noun and adjective tags respectively
    with open(filename, 'a') as fp:
        pickle.dump(tags_n, fp)
    fp.close()
    with open(filename1, 'a') as fp:
        pickle.dump(tags_adj, fp)
    fp.close()
def tagLemma(self, word_old):
    #print tag(word_old)
    for word, pos in tag(word_old):
        if pos == "NNS":  # plurals
            x = singularize(word)
        elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:  # verbs to infinitive
            x = conjugate(word, INFINITIVE)
            # To-Do: fix this
            if x:  # conjugation sometimes fails
                x = x
            else:
                x = word
        else:
            x = word
    return x
def sentiment(content):
    relevant_types = ["JJ", "VB", "RB"]  # adjectives, verbs, adverbs
    score = 0
    wordnet.sentiment.load()
    synsets = wordnet.synsets
    for word, pos in tag(content):
        if pos in relevant_types:
            try:
                synset = synsets(word, pos)[0].weight
            except KeyError:
                # incorrect part of speech tag
                continue
            ps, ns, os = synset
            score = score + (ps - ns)
    return 1 if score >= 0 else -1
def get(self, descriptions, mode = "link", index = False, field = False, count = False):
    """Returns a list of names or connects a list of items to lexicon items

    Keyword arguments:
    descriptions --- EHRI.get() descriptions list
    mode --- Either link or search
    index --- Override self.index
    field --- Field to query, default is scopeAndContent
    count --- Override self.count
    """
    if index:
        self.index = index
    if field:
        self.field = field
    if count:
        self.count = count
    results = {}
    # Looping on items
    for description in descriptions:
        for word, pos in tag(description[self.field]):
            if pos in ["NN", "NNS", "NNP"]:
                w = singularize(word)
                w = w.lower()
                if w not in results:
                    results[w] = 0
                results[w] += 1
                # If we are looking for stats about items
                if mode == "link":
                    if w.title() in self.available:
                        for item in self.lexicon:
                            print item
                            if w.title() in self.lexicon[item]:
                                self.count[item] += 1
                                if description[self.identifier] not in self.index:
                                    self.index[description[self.identifier]] = []
                                self.index[description[self.identifier]].append(item)
    if mode == "link":
        return self.index
    elif mode == "search":
        return results
def postag_feature_builder(text, target_pos=('JJ', 'NN', 'VB', 'NP', 'RB', 'CD')):
    """
    faster version of the tag feature builder
    uses paten.tag instead of paten.parsetree
    """
    if not text:
        return {}
    # tag each word
    try:
        result = patvec.count(
            (word for word, tag in paten.tag(text, tokenize=True, encoding='utf-8')
             if tag in target_pos))
    except IndexError as e:
        print text, e
        result = {}
    return result
def clean_text_by_word(text, language="english"):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    init_textcleanner(language)
    text_without_acronyms = [replace_with_separator(text[i].text, "", [AB_ACRONYM_LETTERS])
                             for i in range(len(text))]
    original_sentences = [list(tokenize(text_without_acronyms[i], to_lower=True, deacc=True))
                          for i in range(len(text_without_acronyms))]
    # original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    original_words = []
    for i, basicSentence in enumerate(original_sentences):
        text[i].basic = u' '.join(basicSentence)
        original_words += basicSentence
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def stemming(self, tokens):
    text = " ".join(tokens)
    words = []
    part_of_speech = {}
    part_of_speech['noun'] = ["NN"]
    part_of_speech['verbs'] = ["VB", "VBG", "VBP", "VBZ", "VBN", "VBD"]
    part_of_speech['plural'] = ["NNS"]
    part_of_speech['adjective'] = ["JJ"]
    for word, pos in tag(text):
        if pos in part_of_speech['noun']:
            word = self.stemmer.stemming(word)
        if pos in part_of_speech["verbs"]:
            word = lemma(word)
        if pos in part_of_speech['plural']:
            word = singularize(word)
        if pos in part_of_speech['adjective']:
            word = self.stemmer.stemming(word)
        words.append(word)
    return words
def pos_tag_text(text):

    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
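# A minimal sketch of calling pos_tag_text() above, assuming `from nltk.corpus import wordnet as wn`
# and `from pattern.en import tag` as the function expects. Exact tags depend on pattern's tagger,
# so the expected output below is approximate, roughly [('dogs', wn.NOUN), ('bark', wn.VERB), ('loudly', wn.ADV)].
print(pos_tag_text("Dogs bark loudly"))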
def sentiment(content):
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    synsets = wordnet.synsets
    for word, pos in tag(content):
        if pos in relevant_types:
            try:
                synset = synsets(word, pos)[0].weight
            except KeyError:
                # incorrect part of speech tag
                continue
            positivity, negativity, objectivity = synset
            score = score + (positivity - negativity) * (1 - objectivity)
    return score
def pos_tag_sentence(self, sentence, verb='^'):
    half_window = (self.window / 2)
    sentence = sentence.decode('utf8', 'replace')
    sentence_pos = pattern.tag(sentence)
    sentence_pos = [e for e in sentence_pos if e[1] in self.pos_tags]
    verb_pos = [v_p for v_p, e in enumerate(sentence_pos) if e[0] == verb][0]
    start_pos, end_pos = verb_pos - half_window, verb_pos + half_window
    start_pos = 0 if start_pos < 0 else start_pos
    word_context = sentence_pos[start_pos: end_pos]
    # pad the first element to form equal length vectors
    if len(word_context) < self.window:
        window_append = [word_context[0]] * (self.window - len(word_context))
        window_append.extend(word_context)
        word_context = window_append
    word_context.pop(half_window)
    return word_context
def clean(doc):
    st = ""
    sciencestopwords = set([u'model', 'according', 'data', u'models', 'function', 'properties', 'approach',
                            'parameters', 'systems', 'number', 'order', u'data', 'analysis', u'information',
                            u'journal', 'results', 'using', 'research', 'consumers', 'scientists', 'model',
                            'models', 'journal', 'researchers', 'paper', 'new', 'study', 'time', 'case',
                            'simulation', u'simulation', 'equation', 'based', 'years', 'better', 'theory',
                            'particular', 'many', 'due', 'much', 'set', 'studies', 'systems', 'simple',
                            'example', 'work', 'non', 'experiments', 'large', 'small', 'experiment',
                            u'experiments', 'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    for (word, pos) in tag(doc):
        if pos in ["JJ", "NNS", "NN", "NNP"]:
            st = st + word + " "
        else:
            if st != "":
                st = st[0:-1] + " "
                #print "got one"
    wordl = st.lower().split()
    s = ""
    for word in wordl:
        if word not in sciencestopwords:
            s = s + " " + word
    return s
def restructCaption(self, pred_caption, subject_set, object_set, pred_relations):
    total_rel = [0 for i in range(len(subject_set.keys()))]
    skey = list(subject_set.keys())
    for_tagging_caption = pred_caption.lower()
    splitted_caption = for_tagging_caption.split()
    next_tag_of_subject_set = {}
    verb_noun_tag = ["VB", "VBP", "VBZ", "VBG", "VBD", "VBN", "NN"]
    person_tag = ["woman", "man", "people"]
    for subject in skey:
        for i in range(len(splitted_caption)):
            if subject == splitted_caption[i] or (subject == "person" and splitted_caption[i] in person_tag):
                next_word = i + 1
                if next_word < len(splitted_caption):
                    if Logging:
                        print("[restructCaption] next_word is '%s'" % (splitted_caption[next_word]))
                    for word, pos in tag(splitted_caption[next_word]):
                        if Logging:
                            print("[restructCaption] the pos of next word is '%s'" % (pos))
                        if pos in verb_noun_tag:
                            next_tag_of_subject_set[subject] = True
                            if Logging:
                                print("[restructCaption] next_tag(VERB OR NOUN?) is appended: '%s'"
                                      % (subject + ' : ' + splitted_caption[next_word]))
                        else:
                            next_tag_of_subject_set[subject] = False
                    break
                else:
                    next_tag_of_subject_set[subject] = False
            else:
                next_tag_of_subject_set[subject] = False
    for i in range(len(skey)):
        if Logging:
            print("[restructCaption] the subject key is '%s'" % (skey[i]))
        for j in range(len(subject_set[skey[i]])):
            if Logging:
                print("[restructCaption] next_tag is '%s'" % (next_tag_of_subject_set[skey[i]]))
            if j == 0:
                # total_rel[i] = pred_relations[subject_set[skey[i]][j]][0] + ' ' + pred_relations[subject_set[skey[i]][j]][1] + ' ' + pred_relations[subject_set[skey[i]][j]][2]
                total_rel[i] = pred_relations[subject_set[skey[i]][j]][1] + ' ' + pred_relations[subject_set[skey[i]][j]][2]
                if len(subject_set[skey[i]]) == 1 and next_tag_of_subject_set[skey[i]]:
                    total_rel[i] = total_rel[i] + ' and'
                continue
            total_rel[i] = total_rel[i] + ' and ' + pred_relations[subject_set[skey[i]][j]][1] + ' ' + pred_relations[subject_set[skey[i]][j]][2]
            if next_tag_of_subject_set[skey[i]]:
                total_rel[i] = total_rel[i] + ' and'
    if Logging:
        print("[restructCaption] total phrase is '%s'" % (total_rel))
    # Insert part
    final_caption = pred_caption
    splitted_final_caption = final_caption.split()
    pop_relations_index = []
    for i in range(len(skey)):
        # final_caption = final_caption.replace(skey[i], total_rel[i])
        if skey[i] == "person":
            if "man" in splitted_final_caption:
                final_caption = final_caption.replace("man", "man" + ' ' + total_rel[i])
            elif "woman" in splitted_final_caption:
                final_caption = final_caption.replace("woman", "woman" + ' ' + total_rel[i])
            elif "people" in splitted_final_caption:
                final_caption = final_caption.replace("people", "people" + ' ' + total_rel[i])
            else:
                index_list = subject_set[skey[i]]
                # print(index_list)
                for j in range(len(index_list)):
                    pop_relations_index.append(index_list[j])
        else:
            if skey[i] in splitted_final_caption:
                final_caption = final_caption.replace(skey[i], skey[i] + ' ' + total_rel[i])
            else:
                index_list = subject_set[skey[i]]
                # print(index_list)
                for j in range(len(index_list)):
                    pop_relations_index.append(index_list[j])
    # print(pop_relations_index)
    used_relations = []
    for i in range(len(pred_relations)):
        if i in pop_relations_index:
            continue
        else:
            used_relations.append(pred_relations[i])
    if Logging:
        print("[restructCaption] final caption is '%s'" % (final_caption))
        print("[restructCaption] final used relations are '%s'" % (used_relations))
    # print(used_relations)
    return final_caption, used_relations
if inputfile.endswith('/') == False:
    inputfile += '/'
srts = [inputfile + f for f in os.listdir(inputfile) if f.lower().endswith('srt')]
for srt in srts:
    f = open(srt, 'r')
    for line in f:
        if line.find('-->') == -1:
            text += line
    f.close()
text = re.sub(r'^\d+[\n\r]', '', text, flags=re.MULTILINE)
tags = tag(text)
pos = [t[1] for t in tags]
ngrams = {}
n = int(sys.argv[2])
for i in range(len(pos) - n + 1):
    gram = tuple(pos[i:i + n])
    if gram in ngrams:
        ngrams[gram] += 1
    else:
        ngrams[gram] = 1
for ngram in sorted(ngrams, key=ngrams.get, reverse=True):
    count = ngrams[ngram]
    if count > 4:
        print ' '.join(ngram) + ": " + str(count)
def tag(self, tokens):
    # don't import at top since don't want to fail if not installed
    from pattern.en import tag
    # not tokenizing ensures that the number of tagged tokens returned is
    # the same as the number of input tokens
    return tag(u' '.join(tokens), tokenize=False)
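# Hedged usage sketch for the wrapper above; `PatternTagger` is a hypothetical stand-in for whatever
# class defines this tag() method. With tokenize=False, pre-split input keeps token counts aligned:
# tagger = PatternTagger()
# tagger.tag([u'black', u'cats'])  # -> [(u'black', u'JJ'), (u'cats', u'NNS')], one tuple per input token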
f = open(path + genreFolder + "/" + files, "r")
plotText = f.read()
f.close()
plotText = re.sub('<!--.*?>.*?-->', '', plotText, 0, re.I | re.S)
plotText = re.sub('<.+?>', '', plotText, 0, re.I | re.S)
sentList = [x.replace("\n", " ") for x in nltk.sent_tokenize(plotText.replace("\t", ""))]
for strSentence in sentList:
    #print(strSentence)
    for word, pos in tag(strSentence):
        if pos in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
            word = str(lemma(word))
            if (word not in ("be", "do", "let", "begin", "have", "try", "start")):
                verbList.append(word)
    a = parse(strSentence, relations=True, lemmata=True)
    #pprint(a)
    sentence = Sentence(a)
    for i in range(0, len(sentence.verbs) - 1):
        strVP = str(' '.join(sentence.verbs[i].lemmata))
        vpList.append(strVP)
    #print(sentence.relations)
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print s
print

# Prettier output can be obtained with the pprint() command:
pprint(s)
print

# The string's split() method will (unless a split character is given)
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print s.split()
print

# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s, light=True)
print s
for word, tag in s:
    if tag == "NN":  # Find all nouns in the input string.
        print word
(first iteration)
1st line: contain 9 syllables
2nd line: contain 8 syllables
3rd line: contain 7 syllables
...
9th line: contain 1 syllable
"""

from pattern.en import parsetree
from pattern.en import tag
from pattern.en import pprint


def word_eval(string):
    pprint(parsetree(string, relations=True))
    for word, pos in tag(string):
        if pos == "NN":
            print word


def gutenberg_text_gather(current_URL):
    from pattern.web import *
    buddhist_psalm_text = URL(current_URL).download()
    print buddhist_psalm_text
    # Save data to a file (will be part of your data fetching script)
    f = open('buddhist_psalm_text.pickle', 'w')
    pickle.dump(all_texts, f)
    f.close()
    # Load data from a file (will be part of your data processing script)
visualize_sentence_tree(t)

from nltk.corpus import treebank_chunk
data = treebank_chunk.chunked_sents()
train_data = data[:4000]
test_data = data[4000:]
print train_data[7]

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print tagged_simple_sent

chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print c

chink_grammar = """
NP: {<.*>+}      # chunk everything as NP
    }<VBD|IN>+{
"""
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print s
print

# Prettier output can be obtained with the pprint() command:
pprint(s)
print

# The string's split() method will (unless a split character is given)
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print s.split()
print

# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s)
print s
for word, tag in s:
    if tag == "NN":  # Find all nouns in the input string.
        print word
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred')

print 'Quantification'
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

# parse
s = parse('I eat pizza with a fork.')
pprint(s)

# tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

# The modality() function returns a value between -1.0 and +1.0, expressing the degree of certainty.
s2 = "Some amino acids tend to be acidic while others may be basic."  # weaseling
se = Sentence(parse(s2, chunks=False, lemmata=True))
print modality(se)
from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
            count += 1
    except:
        nevermind = 1

writer.close()
print "[+] saved to files. Saved", count, "words."

# ----------------------------

rawjson = open("archetypes_settings_corpora.json").read()  # puts the file as a big string into the variable rawjson
data = json.loads(rawjson)  # json.loads takes a string and turns it into a data structure

for elem in data["settings"]:
    w = elem["name"]
    pos = tag(w)[-1][1]
    # print "-"*20
    # print w, pos
    add_word(w, pos)
    if pos.startswith("VB") and Word(w).lemmatize('v') is not w:
        w = Word(w).lemmatize('v')
        pos = tag("to " + w)[-1][1]
        # print "-"*5
        # print w, pos
        add_word(w, pos)
    if pos.startswith("NN") and Word(w).lemmatize('n') is not w:
        w = Word(w).lemmatize('n')
        pos = tag(w)[-1][1]
        # print "-"*5
        # print w, pos