def drivel(noun):
    """ Generates drivel by shifting nouns in the description of the shifted noun,
        and prepending random alliterative adjectives.
    """
    drivel = []
    description = shift(noun)[1]
    description = description.split(";")[0]
    for sentence in parsetree(description):
        for i, w in enumerate(sentence.words):
            w, tag = w.string, w.tag
            if tag in ("VBD", "VBZ"):  # verb => lemmatize, then shift to past tense
                w = conjugate(w, "infinitive")
                w = conjugate(w, "past")
            if tag == "NN":  # noun => shifted noun + alliterative adjective
                try:
                    w = shift(w)[0]
                    a = list(alliterate(w))
                    if a:
                        # Replace the preceding adjective, if any.
                        # (Fixed: the original checked sentence.words[i].tag,
                        # i.e. the current noun, which can never be "JJ".)
                        if i > 0 and sentence.words[i - 1].tag == "JJ":
                            drivel.pop()
                        drivel.append(choice(a))
                except:
                    pass
            drivel.append(w)
    return " ".join(drivel)
def negate_verb_leaves(leaf_nodes):
    """ Takes a list of leaf nodes and tries to negate it by searching for "not".
        TODO: This code could be cleaner. It's repeated all over the place.
    """
    idx = None
    for i in xrange(len(leaf_nodes)):
        leaf = leaf_nodes[i]
        # We have to call is_leaf because sometimes we have
        # non-verb nodes treated as leaves.
        if is_leaf(leaf) and leaf.label() == 'RB' and leaf[0].lower() == 'not':
            idx = i
            break
    if idx is not None:  # "is not None": idx == 0 is a valid match position
        negative = leaf_nodes
        positive = leaf_nodes[:idx] + leaf_nodes[idx + 1:]
    else:
        positive = leaf_nodes
        if len(leaf_nodes) == 1 and conjugate(leaf_nodes[0][0], 'VB') != 'be':
            # Do-support: "runs" -> "does not run".
            pos = leaf_nodes[0].label()
            negative = [
                Tree(pos, [conjugate('do', pos)]),
                Tree('RB', ['not']),
                # Fixed: conjugate() expects the word string, not the Tree node.
                Tree('VB', [conjugate(leaf_nodes[0][0], 'VB')])
            ]
        else:
            negative = [leaf_nodes[0], Tree('RB', ['not'])] + leaf_nodes[1:]
    return (positive, negative)
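# A hedged usage sketch: assumes nltk.Tree and an is_leaf() helper (not shown
# here) that identifies pre-terminal nodes.
#
#   negate_verb_leaves([Tree('VBZ', ['runs'])])
#   -> ([Tree('VBZ', ['runs'])],
#       [Tree('VBZ', ['does']), Tree('RB', ['not']), Tree('VB', ['run'])])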
def cj(word, num):
    ''' Conjugate verb based on count. '''
    if num != 1:
        word = conjugate(word, 'pl')   # conjugate as plural
    else:
        word = conjugate(word, '3sg')  # conjugate as 3rd-person singular
    return word
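# Usage sketch (assumes pattern.en is installed):
#
#   cj('walk', 1)  # -> 'walks' (one subject: 3rd-person singular)
#   cj('walk', 3)  # -> 'walk'  (plural)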
def testBasic():
    from pattern.en import referenced
    print referenced('hour')

    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')
    print lemma('purring')
    print conjugate('purred', '3sg')  # he / she / it
def find_verb_form(original_form, original_lemma, new_lemma):
    """
    Figure out the original tense of the verb, then apply that tense to new_lemma.
    There might be more than one match; keep it simple and just apply the first one.
    """
    possible_conjugations = tenses(original_form)
    if not possible_conjugations:
        return new_lemma  # unknown form; fall back to the lemma
    # Fixed: the original indexed [1] when more than one tense matched,
    # contradicting the docstring ("apply the first one").
    return conjugate(new_lemma, possible_conjugations[0])
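# Illustrative behavior; pattern.en's tenses() yields tuples of
# (tense, person, number, mood, aspect):
#
#   tenses('walked')[0]                      # e.g. ('past', None, None, 'indicative', 'imperfective')
#   find_verb_form('walked', 'walk', 'run')  # -> 'ran' (past tense carried over)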
def explore_syn_tree(self, word_tuple, pos=None):
    if self.check_in_words(word_tuple[0]):
        return word_tuple[0]
    # Hand-picked substitutions.
    if word_tuple[0] == "difficult":
        return "hard"
    if word_tuple[0] == "saturn":
        return "up goer"
    if pos is None:
        synsets = wn.synsets(word_tuple[0])
    else:
        synsets = wn.synsets(word_tuple[0], pos=pos)
    for synset in synsets:
        matching_hypernym = self.explore_hypernyms(synset)
        if matching_hypernym is not None:
            if pos == wordnet.NOUN and self.is_plural(word_tuple[0]):
                # The word is a plural.
                return self.inflect_engine.plural(matching_hypernym)
            if pos == wordnet.VERB:
                pattern_tag = self.get_pattern_tense(word_tuple[1])
                person = 3 if word_tuple[1] == "VBZ" else 1
                matching_hypernym = conjugate(matching_hypernym, tense=pattern_tag,
                                              person=person, parse=True)
            return matching_hypernym
        matching_hyponym = self.explore_hyponyms(synset)
        if matching_hyponym is not None:
            if pos == wordnet.NOUN and self.is_plural(word_tuple[0]):
                return self.inflect_engine.plural(matching_hyponym)
            if pos == wordnet.VERB:
                pattern_tag = self.get_pattern_tense(word_tuple[1])
                person = 3 if word_tuple[1] == "VBZ" else 1
                # Fixed: the original assigned to a misspelled variable
                # ("matching_hypornym") and returned the unconjugated word.
                matching_hyponym = conjugate(matching_hyponym, tense=pattern_tag,
                                             person=person, parse=True)
            return matching_hyponym
    # Fallback: pick a random word that was seen with the same POS tag.
    wn_pos = word_tuple[1]
    possible_solutions = set()
    for tagged_tuple in self.tagged_words:
        if wn_pos == tagged_tuple[0][1]:
            possible_solutions.add(tagged_tuple[0][0])
    return random.sample(possible_solutions, 1)[0]
def stem_word(word):
    try:
        if word.endswith("s"):
            if singularize(word) in nltk_words:
                return singularize(word)
            return word
        if word.endswith("d") or word.endswith('ing'):
            # NB: conjugate() with no tense uses pattern.en's defaults here;
            # if the intent is the base form, lemma(word) may be clearer.
            if conjugate(word) in nltk_words:
                return conjugate(word)
            return word
    except:
        return word
    return word
def dictionary_tag(sentence):
    client = MongoClient('127.0.0.1')
    db = client.dictionaries
    collection = db.sfu
    emots = db.emoticons
    for word in sentence:
        # Emoticon lookup.
        qu = emots.find_one({'word': word[0].encode('utf-8').strip(), 'pos': 'em'})
        if qu:
            word[1] = 'em'
            word.append(qu['polarity'])
            continue
        qu = word[0].encode('utf-8').strip().lower()
        qu = re.sub(":", " ", qu)
        # Negation words.
        if 'n\'t' in qu or 'not' in qu or qu == 'no':
            word[1] = 'neg'
            word.append("")
            continue
        # Word as it is.
        single = collection.find_one({'word': qu, 'pos': word[1][:2].lower()})
        if single:
            word[1] = single['pos']
            word.append(single['polarity'])
        else:
            # Convert to 1st-person present tense and retry.
            qu = conjugate(qu, '1sg')
            single = collection.find_one({'word': qu, 'pos': word[1][:2].lower()})
            if single:
                word[1] = single['pos']
                word.append(single['polarity'])
            else:
                word.append('')
    return sentence
def create_description(self):
    pat = 'VB|VBD|VBZ|VBG * NN IN * NN'
    # pat = 'PRP * VB|VBD|VBZ|VBG * NN'
    phrases = search.search_out(self.source_text, pat)
    conjugated_phrases = []
    for phrase in phrases:
        words = []
        for word, pos in tag(phrase):
            if pos in ["VBZ", "VBD", "VB", "VBG"]:
                words.append(conjugate(word, "3sg"))
            # elif pos == "NN" and random.random() < .1:
            #     words.append(self.define_word(word))
            else:
                words.append(word)
        conjugated_phrases.append(' '.join(words))
    artifacts = list(self.artifacts)
    sentence_prefixes = ["The present invention", "The device", "The invention"]
    paragraph_prefixes = ["The present invention",
                          "According to a beneficial embodiment, the invention",
                          "According to another embodiment, the device",
                          "According to a preferred embodiment, the invention",
                          "In accordance with an alternative specific embodiment, the present invention"]
    self.description = ''
    for i, phrase in enumerate(conjugated_phrases):
        if i == 0:
            line = paragraph_prefixes[0] + " " + phrase
        elif random.random() < .1:
            line = "\n\n" + random.choice(paragraph_prefixes) + " " + phrase
        else:
            line = random.choice(sentence_prefixes) + " " + phrase
        self.description += line + ". "
def test_conjugate(self):
    # Assert different tenses with different conjugations.
    for (v1, v2, tense) in (
            ("be", "be",      en.INFINITIVE),
            ("be", "am",      en.PRESENT_1ST_PERSON_SINGULAR),
            ("be", "are",     en.PRESENT_2ND_PERSON_SINGULAR),
            ("be", "is",      en.PRESENT_3RD_PERSON_SINGULAR),
            ("be", "are",     en.PRESENT_PLURAL),
            ("be", "being",   en.PRESENT_PARTICIPLE),
            ("be", "was",     en.PAST_1ST_PERSON_SINGULAR),
            ("be", "were",    en.PAST_2ND_PERSON_SINGULAR),
            ("be", "was",     en.PAST_3RD_PERSON_SINGULAR),
            ("be", "were",    en.PAST_PLURAL),
            ("be", "were",    en.PAST),
            ("be", "been",    en.PAST_PARTICIPLE),
            ("had", "have",   "inf"),
            ("had", "have",   "1sg"),
            ("had", "have",   "2sg"),
            ("had", "has",    "3sg"),
            ("had", "have",   "pl"),
            ("had", "having", "part"),
            ("has", "had",    "1sgp"),
            ("has", "had",    "2sgp"),
            ("has", "had",    "3sgp"),
            ("has", "had",    "ppl"),
            ("has", "had",    "p"),
            ("has", "had",    "ppart"),
            ("imaginerify", "imaginerified", "3sgp")):
        self.assertEqual(en.conjugate(v1, tense), v2)
    print "pattern.en.conjugate()"
def _transform_word(self, word, pos, less, more):
    """Transforms a word to be less 'less' and more 'more'.

    :param word: word to transform
    :type word: str
    :param pos: part of speech of the word
    :type pos: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list
    :returns: transformed word
    :rtype: str
    """
    new_word = self._get_similar_word(word, less, more)
    new_pos = en.tag(new_word)[0][1]
    if (pos[:2] != new_pos[:2]) or word == new_word:
        return word
    # Handle nouns.
    if pos.startswith('NN'):
        # Pluralization.
        if pos.endswith('S') and not new_pos.endswith('S'):
            new_word = en.pluralize(new_word)
        elif not pos.endswith('S') and new_pos.endswith('S'):
            new_word = en.singularize(new_word)
        # Capitalization.
        if word[0].isupper():
            new_word = new_word[0].upper() + new_word[1:]
        else:
            new_word = new_word.lower()
    # Handle verbs.
    elif pos.startswith('VB'):
        tense, person, number = en.tenses(word)[0][:3]
        # Conjugation.
        conjugated = en.conjugate(new_word,
                                  tense=tense,
                                  person=person,
                                  number=number,
                                  parse=False)
        if conjugated is not None:
            new_word = conjugated
    # Remove underscores from joint words.
    new_word = new_word.replace('_', ' ')
    return new_word
def verbConjugate(lemma, rel, aan):
    # Fixed: a missing comma after "/r/MemberOf" silently concatenated it
    # with "/r/IsA", so neither relation was actually avoided.
    relAvoid = ["/r/CapableOf", "/r/PartOf", "/r/MemberOf",
                "/r/IsA", "/r/HasA", "/r/TranslationOf", "/r/HasProperty"]
    if rel not in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb == "to":
                result = lemma
        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma
    else:
        result = lemma
    return result
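# Illustrative calls (hedged; relies on pattern.en's parsetree/conjugate):
#
#   verbConjugate('eat an apple', '/r/Desires', False)  # -> 'eating an apple'
#   verbConjugate('eat an apple', '/r/IsA', False)      # -> 'eat an apple' (avoided relation)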
def make_thesaurus(file_path):
    """ Returns dict of counters `thesaurus`, where
        thesaurus[word] = {synonym1: 4, syn2: 8, syn3: 1, ...}
    """
    thesaurus = defaultdict(lambda: Counter())
    with open(file_path, "r") as f:
        for line in f:
            # Ignore repeated book title headers.
            if _is_title(line):
                continue
            parsed = parse(line)
            for tagged_word in parsed.split()[0]:
                word = tagged_word[0].strip().lower()
                pos = tagged_word[1][0]  # get POS for word
                # Reject non-ASCII characters.
                try:
                    word = word.decode("ascii")
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue
                # Reject whitespace-only tokens.
                if re.match("^[\s]*$", word):
                    continue
                # Increment word count of word w.
                thesaurus[word].update([word])
                # Retrieve syn = synonym[w], add to thesaurus[syn].
                for syn in wn.get_synonyms(word):
                    syn = syn.name().split(".")[0]
                    if pos == "N":
                        # If noun, add plural form if word is plural, else singular.
                        if word == pluralize(word):
                            thesaurus[pluralize(syn)].update([word])
                        else:
                            thesaurus[syn].update([word])
                    elif pos == "V":
                        # If verb, conjugate synonyms to the right form before
                        # adding them to the thesaurus.
                        word_tenses = tenses(word)
                        if word_tenses:
                            thesaurus[conjugate(syn, tense=word_tenses[0][0])].update([word])
                        else:
                            thesaurus[syn].update([word])
                    else:
                        thesaurus[syn].update([word])
    # Update thesaurus with mappings, if map_file exists.
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    thesaurus = _add_mappings(map_file, thesaurus)
    return thesaurus
def get_conjugations(lem):
    vforms = []
    if lemma(lem) == 'be':
        vforms = [i for i in EXCEPTIONS]
    else:
        for ta in TENSE_ASPECTS:
            c = conjugate(lemma(lem), ta)
            vforms.append(c + '|||' + ta)
    return vforms
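# Hypothetical setup showing the intended output shape; TENSE_ASPECTS is
# defined elsewhere, here assumed to hold pattern.en tense aliases:
#
#   TENSE_ASPECTS = ['1sg', '3sg', 'part', 'p']
#   get_conjugations('walked')
#   -> ['walk|||1sg', 'walks|||3sg', 'walking|||part', 'walked|||p']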
def generalize(self, words):
    # Convert to a generalized sentence (not company-specific).
    words[0] = 'We__PRP'
    # Pluralize the verb using the pattern library.
    words[1] = conjugate(self.remove_pos_tag(words[1]), 'pl') + '__VBP'
    return words
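# Expected shape (hedged; remove_pos_tag() is assumed to strip the '__VBZ'
# suffix so conjugate() sees a bare verb):
#
#   generalize(['Acme__NNP', 'sells__VBZ', 'widgets__NNS'])
#   -> ['We__PRP', 'sell__VBP', 'widgets__NNS']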
def deconjugate_leaves(T):
    if T.height() == 1:
        return
    if T.height() == 2:  # pre-terminal node (POS tag over a word)
        if T.label()[0] == "V":
            T.set_label("VB")
            T[0] = conjugate(T[0], "VB")
    else:
        for i in T:
            deconjugate_leaves(i)
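# Runnable sketch (assumes nltk and pattern.en):
from nltk import Tree
t = Tree('S', [Tree('NP', [Tree('PRP', ['she'])]),
               Tree('VP', [Tree('VBZ', ['walks'])])])
deconjugate_leaves(t)
# t is now: (S (NP (PRP she)) (VP (VB walk)))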
def uncompleteSentence(self, sentence):
    full_sentence = nltk.word_tokenize(' '.join(sentence))
    tags = nltk.pos_tag(full_sentence)
    new_sentence = []
    for i in range(len(full_sentence) - 1):
        # A token counts as a verb only if every WordNet synset is verbal.
        isVerb = True
        synsets = wordnet.synsets(tags[i][0])
        for syn in synsets:
            # .lexname is a property in older NLTK; a method, lexname(),
            # in newer versions.
            if 'verb.' not in syn.lexname:
                isVerb = False
                break
        if (tags[i][1] == 'NN' or tags[i][1] == 'RB') and isVerb:
            new_sentence.append(conjugate(tags[i][0], 'part'))
        elif tags[i][1] == 'JJ' and isVerb:
            new_sentence.append(conjugate(tags[i][0], 'ppart'))
        else:
            new_sentence.append(tags[i][0])
    new_sentence.append(full_sentence[-1])
    return new_sentence
def process(wrd):
    ignore_pos = ['IN', 'RP', 'TO']  # prepositions, particles, "to"
    exception_lemma = ['flatter', 'flattered']
    if tag(wrd)[0][1] in ignore_pos:
        return wrd
    if any(wrd in ex_l for ex_l in exception_lemma):
        return wrd
    return conjugate(wrd, tense=PAST)
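# Usage sketch (assumes pattern.en's tag() and PAST are imported):
#
#   process('walk')       # -> 'walked' (shifted to past tense)
#   process('to')         # -> 'to'     (POS 'TO' is ignored)
#   process('flattered')  # -> 'flattered' (exception list)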
def make_verbs(inp_fn, out_dir):
    verbs = set()
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            verbs.add(conjugate(line.strip(), 'p'))  # past tense
    print '\tFound', len(verbs), 'verbs'
    out_fn = out_dir + '/verbs.txt'
    with open(out_fn, 'w') as f:
        for n in verbs:
            f.write(n + '\n')
def test_parse_lemma(self):
    # Assert the accuracy of the verb lemmatization algorithm.
    # Note: the accuracy is higher (95%) when measured on CELEX word forms
    # (presumably because en.inflect.VERBS has a high percentage of irregular verbs).
    i, n = 0, 0
    for v in en.inflect.VERBS.infinitives:
        for tense in en.inflect.VERBS.TENSES:
            if en.inflect._parse_lemma(en.conjugate(v, tense)) == v:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.88)
    print "pattern.en.inflect._parse_lemma()"
def random_imperative(noun=None, get_related=True, verb=None, adj=None):
    if noun:
        n = get_related_or_not(noun, True, 'NN') if get_related else noun
    else:
        n = random.choice(NOUNS)
    if verb:
        v = get_related_or_not(verb, True, 'VB')
        if v is None:
            v = verb
    else:
        v = random.choice(VERBS)
    if not adj:
        adj = random.choice(ADJS) if coin_flip(0.5) else ''
    c = ''
    if coin_flip(0.7):
        n = pluralize(n)
        c = random.choice(C2)
    else:
        i = random.randint(1, 5)
        n = quantify(adj + ' ' + n, amount=i)
        adj = ''
    if coin_flip(0.25):
        a = ''
        v = conjugate(v)
    elif coin_flip(0.33):
        v = conjugate(v, 'part')  # present participle
        a = random.choice(A1)
        c = random.choice(C)
    elif coin_flip(0.5):
        v = conjugate(v)
        a = random.choice(A2)
    else:
        v = conjugate(v)
        a = random.choice(A3)
    phrase = '{0} {1} {2} {3} {4}'.format(a, v, c, adj, n)
    phrase = phrase.lstrip()
    return re.sub(' +', ' ', phrase)
def create_defined_template(arr):
    for x in arr:
        x = x.splitlines()[0]
        if x[2] == "P":
            for t in tenses:
                a = conjugate(x[3:], t)
                triggers[x[0]][x[:3]].append(str(a))
        elif x[3:].find(" ") != -1 and x[3:] not in triggers_bi:
            triggers_bi.append(x[3:])
        elif x[:3] in ["BPA", "BPE"] and x[3:] not in triggers_p:
            triggers_p.append(x[3:])
        else:
            triggers["OTHERS"].append(x[3:])
def generate_phrase_1():
    selections = [VERBS, ADJECTIVES, NOUNS, ADVERBS,
                  TRANSITIVE_VERBS, VERBS, ADJECTIVES, NOUNS]
    entropy = sum([log(len(item), 2) for item in selections])
    # Each entry is a list of arguments for conjugate(), or None.
    # (Fixed: 'part' was a bare string; unpacking it with * would have passed
    # each character as a separate argument.)
    conjugations = [['part'], None, None, None,
                    [random_item_from_list([PAST, PRESENT])],
                    ['part'], None, None]
    entropy += 1
    print('%.2f bits of entropy' % entropy)
    sub_list = [random_item_from_list(item) for item in selections]
    for idx, word in enumerate(sub_list):
        if conjugations[idx]:
            sub_list[idx] = conjugate(word, *conjugations[idx])
    return ('the %s %s %s %s %s the %s %s %s' % tuple(sub_list)).replace('_', ' ')
def generate_phrase_2():
    '''Return a phrase and its entropy (in bits) of the form
       (# adj noun) (adverb verb) (adjective noun punctuation)
       E.g., 17 MODERATE TRAYS At once live outed wORTH bOSSES
    '''
    selections = [ADJECTIVES, NOUNS, ADVERBS, TRANSITIVE_VERBS,
                  ADJECTIVES, NOUNS, TERMINAL_PUNCTUATION]
    entropy = sum([log(len(item), 2) for item in selections])
    conjugations = [None, None, None,
                    [random_item_from_list([PAST, PRESENT]), 3, PLURAL],
                    None, None, None]
    sub_list = [random_item_from_list(item) for item in selections]
    for idx, word in enumerate(sub_list):
        if conjugations[idx]:
            sub_list[idx] = conjugate(word, *conjugations[idx])
    entropy += 1
    sub_list[1] = pluralize(sub_list[1])
    sub_list[5] = pluralize(sub_list[5])
    entropy += log(997, 2)
    # Randomize the capitalization of each token.
    for idx, item in enumerate(sub_list):
        rnd = randint(4)
        if rnd == 1:
            sub_list[idx] = item.capitalize()
        if rnd == 2:
            sub_list[idx] = item.upper()
        if rnd == 3:
            sub_list[idx] = item[0] + item[1:].upper()
        entropy += 2
    phrase = ('%i %s %s %s %s %s %s%s' % tuple([randint(997) + 2] + sub_list)).replace('_', ' ')
    # Insert two random symbols into the sentence.
    for _ in range(2):
        insert_point = randint(len(phrase) + 1)
        entropy += log(len(phrase) + 1, 2) + log(len(SYMBOLS), 2)
        phrase = phrase[:insert_point] + random_item_from_list(SYMBOLS) + phrase[insert_point:]
    return phrase, entropy
def _interface(sentence, edblist):
    target_words, word_pre, person_taggers, org_taggers = _Stem(sentence, edblist)
    token_list = []
    if len(word_pre) > 0:
        word_pre[0] = word_pre[0][0].upper() + word_pre[0][1:]
    for word in word_pre:
        tokens = {}
        if word.strip().lower() == person_taggers.strip().lower():
            # A person: offer pronoun substitutes.
            tokens[word] = [word, "He", "She"]
        elif word.strip().lower() == org_taggers.strip().lower():
            # An organization: singular or plural pronoun.
            if _isplural(org_taggers.strip().split()[-1]) or (org_taggers.strip().split()[-1] == 'they'):
                tokens[word] = [word, "They"]
            else:
                tokens[word] = [word, "It"]
        else:
            if lmtzr.lemmatize(word) not in target_words:
                token_list.append(word)
                continue
            # Rank candidate substitutions with a language model.
            r_sent = []
            candidates = Generate_candidates_topN(word, sentence, 19, edblist)
            for i in range(len(candidates)):
                r_sent.append(candidates[i] + "@" + sentence.replace(word, candidates[i]))
            sub_top10 = kenlm_topn(r_sent, 9, sentence)
            if lmtzr.lemmatize(word) not in sub_top10:
                sub_top10.insert(0, word)
            if len(tenses(word)) > 0:
                # Re-conjugate each candidate to match the original word's tense.
                _sub_top10 = []
                for w in sub_top10:
                    _sub_top10.append(conjugate(w, tenses(word)[0][0], 3))
                tokens[word] = _sub_top10
            else:
                tokens[word] = sub_top10
        if tokens:
            token_list.append(tokens)
    return token_list
def verb_extract(child_speech):
    tokenized = [nltk.word_tokenize(line) for line in child_speech]
    tokenized = list(itertools.chain.from_iterable(tokenized))
    tokens = [token for token in tokenized if re.match("^[A-Za-z.]*$", token)]
    verbs = []
    if TAGGER == "senna" or TAGGER == "nltk":
        tokens = ' '.join(tokens).split('.')
        tagged = [[verb.lower() for (verb, POS) in liste if POS.startswith('VB')]
                  for liste in [nltk.pos_tag(nltk.word_tokenize(token)) for token in tokens]]
        verbs = filter(None, tagged)
        verbs = [item for sublist in verbs for item in sublist]
    elif TAGGER == "spacy":
        tokens = ' '.join(tokens)
        tokens = TAG(unicode(tokens), entity=False)
        tokens = zip(tokens, [str(tok.tag_) for tok in tokens])
        verbs = [str(verb).lower() for (verb, pos) in tokens if pos.startswith('VB')]
    verbs = [(verb, STEM(verb).encode("utf8")) for verb in verbs]
    verbs = [(verb, stem) for (verb, stem) in verbs
             if not (verb in EXCLUSIONS or
                     not conjugate(stem, tense="infinitive") or
                     is_no_change(stem) or
                     conjugate(stem, tense="infinitive") in LIGHT or
                     not in_vocabulary(conjugate(stem, tense="infinitive")) or
                     conjugate(stem, tense="past").endswith("ed"))]
    return verbs
def tagLemma(self, word_old):
    for word, pos in tag(word_old):
        if pos == "NNS":  # plural nouns -> singular
            x = singularize(word)
        elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:
            # Verbs -> infinitive; conjugate() occasionally fails, so fall
            # back to the original word. (Replaces the original's no-op
            # "if x: x = x" construct.)
            x = conjugate(word, INFINITIVE) or word
        else:
            x = word
    return x
def processAction(statement, database_name=DATABASE_NAME):
    match = re.search(r"what happens (?:(?:if)|(?:when)) (?:the)? ([\s\w]+) ([\s\w]+?) ([\s\w]+)\??",
                      statement)
    if match:
        # Search the actions table for subject / verb / object.
        subj = match.group(1)
        verb = conjugate(match.group(2), tense='infinitive')
        obj = match.group(3)
        result = queryTable("actions", {"origin": subj, "ident": verb, "target": obj},
                            database_name)
        if result is None:
            return "Sorry, I don't know what happens when " + subj + " " + verb + " " + obj
        result = queryTable("reactions", {"origin": obj, "action": verb}, database_name)
        (table, thing) = findTableContainingEntityWithIdentOrName(obj, database_name)
        return thing[0] + " says " + result['name']
def add_qualifier(phrase):
    n = random.choice(NOUNS)
    v = random.choice(VERBS)
    a = 'cannot' if coin_flip(0.5) else 'can'
    b = 'of' if coin_flip(0.5) else 'for the'
    n = pluralize(n)
    v = conjugate(v)
    qual = '{0} you {1} {2}'.format(n, a, v)
    return '{0} {1} {2}'.format(phrase, b, qual)
def verbConjugate(lemma, rel, aan):
    # Fixed: a missing comma after "/r/MemberOf" silently concatenated it
    # with "/r/IsA", so neither relation was actually avoided.
    relAvoid = ["/r/CapableOf", "/r/PartOf", "/r/MemberOf",
                "/r/IsA", "/r/HasA", "/r/TranslationOf", "/r/HasProperty"]
    if rel not in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb in ["to", "can"]:
                result = lemma
        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma
    else:
        result = lemma
    # NEW PART TO ADD ARTICLES TO BARE NOUN PHRASES FROM CONCEPTNET
    # try:
    #     nns = s[0].subjects + s[0].objects
    # except:
    #     pass
    # else:
    #     if nns:
    #         for nn in nns:
    #             if not startsWithCheck(nn.string, ['a', 'an', 'the', 'your',
    #                                                'his', 'her', 'its']):
    #                 result = result.replace(nn.string, a_or_an(nn.string))
    return result
"""Transformations for extracting data.""" from pattern.en import (conjugate, PRESENT, INDICATIVE, PROGRESSIVE) import spacy from . import base # TODO: remove the following lines once this issue: # https://github.com/clips/pattern/issues/243, is resolved. try: conjugate('give') except: pass class GerundifyingTransformer(base.LoggedCallable): """Transform the input into a gerund phrase.""" _nlp = spacy.load('en', disable=['ner']) @classmethod def _is_root(cls, token: spacy.tokens.token.Token) -> bool: return token.dep_ == 'ROOT' and token.pos_ == 'VERB' @classmethod def _is_rootlike(cls, token: spacy.tokens.token.Token) -> bool: return (cls._is_root(token) or any(cls._is_root(c) for c in token.conjuncts)) @classmethod def _conjugate(cls, text: str) -> str:
def toPresent(verb):
    return conjugate(verb, PRESENT)
def failure_description_ngram_detect(sentences):
    # stop_word_to_investigate = ['to', 'is', 'are', 'not', 'need', 'reported',
    #                             'seem', 'seems', 'appear', 'appears']
    stop_word_to_investigate = ['is', 'are', 'not', 'to', 'cannot']
    for stop_word in stop_word_to_investigate:
        stopwords_2 = copy.deepcopy(Utility.stopwords_nltk_pattern_custom)
        if stop_word in stopwords_2:
            stopwords_2.remove(stop_word)
        # Use # as delimiter to distinguish from ~ used in previous stages.
        phrases = Phrases(sentences,
                          max_vocab_size=max_vocab_size,
                          min_count=bigram_minimum_count_threshold,
                          threshold=threshold,
                          common_terms=frozenset(stopwords_2),
                          delimiter=delimiter,
                          progress_per=progress_per)
        with open(save_folder_name + '/' + stop_word + '_bigrams.txt', "w") as bigram_2_file:
            c = 1
            for key in phrases.vocab.keys():
                if key in Utility.stopwords_nltk_pattern_custom:
                    continue
                flag = True
                a = key.decode().split("#")
                if len(a) > 1:
                    if stop_word not in a:
                        flag = False
                    if a[0] != stop_word:
                        # Only keep n-grams starting with the stop word.
                        flag = False
                    if stop_word == 'to' and 'be' not in a:
                        # If the stop word is 'to', only keep n-grams that also contain 'be'.
                        flag = False
                    # Drop n-grams containing positional words.
                    for w in a[1:]:
                        if w in Utility.List_of_positional_word:
                            flag = False
                    if flag:
                        if stop_word == 'to':
                            last_word = a[-1]
                            conjugated_last_word = conjugate(last_word)
                            if conjugated_last_word in Utility.List_of_maintenance_verb:
                                logger.info("action word found: " + conjugated_last_word)
                                s = '~'.join(a)
                                if s not in List_of_maintenance_action_ngram:
                                    List_of_maintenance_action_ngram.append(s)
                                continue  # skip the rest so it is not written into the file
                        if stop_word == 'is' or stop_word == 'are':
                            w = a[1:]
                            ngram_without_is_are = delimiter.decode().join(w)
                            if ngram_without_is_are not in List_of_failure_description_ngram_without_is_are:
                                List_of_failure_description_ngram_without_is_are.append(ngram_without_is_are)
                                if len(w) == 1 and w[0] not in List_of_failure_description_single_word:
                                    List_of_failure_description_single_word.append(w[0])
                        s = key.decode()
                        print('{0}\t\t{1:<30}\t\t{2:<10}'.format(c, s, phrases.vocab[key]),
                              file=bigram_2_file)
                        c += 1
    with open("./Input_Output_Folder/Failure_Description/List_of_failure_description_ngram_without_is_are.txt",
              "w") as words_file:
        for index_no, w in enumerate(List_of_failure_description_ngram_without_is_are):
            print('{0}\t\t{1:<10}'.format(index_no, w), file=words_file)
    with open("./Input_Output_Folder/Failure_Description/List_of_failure_description_single_word.txt",
              "w") as words_file:
        for index_no, w in enumerate(sorted(List_of_failure_description_single_word)):
            print('{0}\t\t{1:<10}\t\t{2:<10}'.format(index_no, w, normalized_token_freq_dict[w]),
                  file=words_file)
    with open("./Input_Output_Folder/Failure_Description/List_of_maintenance_action_ngram.txt",
              "w") as words_file:
        for index_no, w in enumerate(sorted(List_of_maintenance_action_ngram)):
            print('{0}\t\t{1:<10}'.format(index_no, w), file=words_file)
def conjugateVerb(self, verb, tense):
    return conjugate(verb, tense)
def extractFeaturesAndWriteBio(READ_PATH, file_type):
    global ALL_poems, bio, cnt
    for subdir, dirs, files in os.walk(READ_PATH):
        for file in files:
            num_of_files = len(files) - 1  # deduct the .DS_Store
            if file_type in file and 'readme' not in file:
                # ID
                id = file.split(".")[0]
                print "\nID:", id.split("_")[1]
                filenames.append(id)
                cnt += 1

                poem_replaced = ""
                replacement_word = ""
                previous_replacement_word = ""
                author = ""
                titles = ""
                title = ""
                new_title = ""
                replaced_ls = []
                new_titles_ls = []
                quit_language = 0
                oscillator = 0
                word_cnt = 0
                # If an EXCEPTION is raised, do not add to the html.
                SKIP_bool = False

                ##########################
                #  Load POEM TEXT FILE   #
                ##########################
                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1] + ".txt"
                print "txt_fn_path:", txt_fn_path
                if os.path.isfile(txt_fn_path) and cnt > 0:
                    txt_data = open(txt_fn_path).read()
                    author = txt_data.split("****!****")[0].strip(' \t\n\r')
                    title = txt_data.split("****!****")[1].strip(' \t\n\r')
                    bio = txt_data.split("****!****")[2]

                    # Clean bio. ("&#9;" etc. are HTML entities that were lost
                    # in extraction; restored here.)
                    bio.replace("\t", "&#9;")
                    bio.replace("\n", " <br>")
                    bio.replace("\r", " <br>")
                    poem_replaced = bio

                    ###############################
                    # REPLACE AUTHOR NAME in poem #
                    ###############################
                    author_ln = author.split(" ")[-1].lstrip()
                    author_fn = author.split(" ")[:-1]
                    author = " ".join(n for n in author_fn) + author_ln

                    #######################
                    # replace BOOK TITLES #
                    #######################
                    new_title = getNewTitle("title").encode('utf-8')

                    #######################
                    #     fake AUTHOR     #
                    #######################
                    new_author = " ".join(random.choice(authors).split(" ")[1:-2]) \
                        + " " + random.choice(authors).split(" ")[-2]

                    ##############################
                    # replace years with another #
                    ##############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(int(w2) - 5, int(w2) + 5)
                                poem_replaced = poem_replaced.replace(w2, str(new_num))
                                replaced_ls.append(new_num)

                    #################
                    #   Load JSON   #
                    #################
                    response = loadJSONfile(READ_JSON_PATH + "poetryFoundation_"
                                            + id.split("_")[1] + "_Alchemy_JSON.txt")
                    if response != "failed":
                        if response.get('entities') is not None:
                            for idx, entity in enumerate(response['entities']):
                                ce = entity['text'].replace("0xc2", " ")
                                ce = ce.replace("0xe2", "'")
                                ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')',
                                            import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')
                                try:
                                    content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    "AAAARGGGGHHH!!!!"
                                if content in poem_replaced:
                                    ##############################################
                                    # Replace similar entities from other JSON   #
                                    ##############################################
                                    replacement_entity = findSimilarEntityinRandomJSON(
                                        content, entity['type'])
                                    cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')',
                                                import_utilities.replace_chars, replacement_entity)
                                    poem_replaced = poem_replaced.replace(content, replacement_entity)
                                    replaced_ls.append(replacement_entity)

                        ##########################
                        #    POS REPLACEMENT     #
                        ##########################
                        token_tuples = nltk.word_tokenize(poem_replaced)
                        tt = nltk.pos_tag(token_tuples)

                        #################
                        #  ADJECTIVES   #
                        #################
                        for i in tt:
                            if "/i" not in i[0] and len(i[0]) > 3 and i[0] != "died":
                                origw = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')',
                                               import_utilities.replace_chars, i[0])
                                origw = import_utilities.strip_punctuation(origw)
                                if i[1] == 'JJ':
                                    JJr = random.choice(JJ)
                                    JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                        i[0], JJr.lstrip().lstrip())
                                    if i[0].istitle():
                                        JJr = JJr.title()
                                    poem_replaced = re.sub(
                                        r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b',
                                        JJr, poem_replaced, 1)
                                    replaced_ls.append(JJr)
                                if i[1] == 'RB':
                                    RBr = random.choice(RB)
                                    RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                        i[0], RBr.lstrip().lstrip())
                                    if i[0].istitle():
                                        RBr = RBr.title()
                                    poem_replaced = re.sub(
                                        r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b',
                                        RBr, poem_replaced, 1)
                                    replaced_ls.append(RBr)

                        ########################
                        #    IS IT ENGLISH?    #
                        ########################
                        for line in poem_replaced.split('\n\r'):
                            if len(line) > 0:
                                if "english" not in import_utilities.get_language(line):
                                    quit_language += 1
                                else:
                                    quit_language -= 1

                        #########################
                        #    SYNSET REPLACE     #
                        #########################
                        for idx, word in enumerate(poem_replaced.split(' ')):
                            if "<br>" not in word and "&#9;" not in word and len(word) > 0:

                                # PRONOUN ' VERB (e.g. "he's", "I'm")
                                if len(word.split("'")) > 1:
                                    if word.split("'")[0] in personal_pronouns:
                                        replacement_word = random.choice(personal_pronouns) \
                                            + "'" + word.split("'")[1] + ' '
                                        poem_replaced.replace(word, replacement_word)

                                ####################################################
                                #            Replacement of OTHERs                 #
                                ####################################################
                                elif not word.lower().strip(" \n\t\r") in stopwords.words('english'):
                                    # Take off leading brackets, commas etc.
                                    word_punct_nopunct = import_utilities.strip_punctuation_bool(word)
                                    word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r")
                                    word_punct = word_punct_nopunct['punct']
                                    punct_bool = word_punct_nopunct['punct_bool']

                                    #####################################################
                                    # MAIN EXCHANGE PROCESS CALL >>>>> GET THE SYNSET   #
                                    #####################################################
                                    if word_nopunct[-4:].lower() == "here":
                                        similarterm = random.choice(import_utilities.heres)
                                    else:
                                        if len(word_nopunct) > 3:
                                            oscillator = oscillator + 1
                                            # STYLE SWITCH... should in future use POS,
                                            # i.e. if noun & oscillator%3, do...
                                            similarterm = import_utilities.synset_creeley(word_nopunct)
                                            if (similarterm is not None
                                                    and similarterm == word_nopunct
                                                    and len(word_nopunct) > 5):
                                                RESERVOIR.sort(key=len)
                                                similarterm = RESERVOIR[idx % len(RESERVOIR)]
                                                print idx, len(RESERVOIR), similarterm, word_nopunct, \
                                                    "PRE>>>>>>>>LAST CHANGE STOP: ", word, "~", similarterm

                                    # Manually get rid of some terrible choices.
                                    naw_terms = ["mind", "lonely"]
                                    if similarterm == "ilk":
                                        similarterm = "like"
                                    if similarterm == "Nox":
                                        similarterm = "nite"
                                    if similarterm == "ope":
                                        similarterm = "does"
                                    if similarterm == "information technology":
                                        similarterm = "it"
                                    if similarterm == "velleity":
                                        similarterm = "want"
                                    if similarterm == "Crataegus laevigata":
                                        similarterm = "may"
                                    if similarterm == "eff":
                                        similarterm = "know"
                                    if similarterm == "naw":
                                        similarterm = "mind"

                                    #######################################
                                    # abbreviations for f*****g states!   #
                                    #######################################
                                    if (word_nopunct.upper() in import_utilities.state_abbrev
                                            and word_nopunct.lower() not in stopwords.words('english')
                                            and "me," not in word):
                                        tmp = similarterm
                                        if word_nopunct == "oh":
                                            similarterm = random.choice(import_utilities.exclaims)
                                        else:
                                            similarterm = random.choice(RESERVOIR)

                                    ##############
                                    # hyphenated #
                                    ##############
                                    hyp = word.split("-")
                                    if len(hyp) > 1:
                                        similarterm = ""
                                        for w in hyp:
                                            if len(w) > 2:
                                                if import_utilities.synset_creeley(w) is not None:
                                                    similarterm += import_utilities.synset_creeley(w) + "-"
                                                else:
                                                    similarterm += w + "-"
                                        similarterm = import_utilities.strip_underscore(similarterm[:-1])

                                    # #########################################################
                                    # # is it a TRUNCATED VERB slang as in singin or wishin   #
                                    # #########################################################
                                    # if similarterm == word_nopunct and len(word) > 2 and 'in' in word_nopunct[-2:]:
                                    #     similarterm = import_utilities.synset_creeley(word_nopunct + 'g')
                                    #     interim = import_utilities.lemma(similarterm)
                                    #     similarterm = import_utilities.conjugate(
                                    #         interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1]

                                    #################
                                    #  SWEAR WORD   #
                                    #################
                                    if word_nopunct in import_utilities.curses:
                                        similarterm = random.choice(import_utilities.curses)

                                    if similarterm is not None:
                                        if len(hyp) > 1:
                                            replacement_word = similarterm
                                        else:
                                            replacement_word = word.replace(word_nopunct, similarterm)
                                        replacement_word = import_utilities.strip_underscore(replacement_word)
                                        replacement_word = import_utilities.replaceNumbers(replacement_word)

                                    ###########################
                                    # RESERVOIR_OF_WEIRDNESS  #
                                    ###########################
                                    if word_nopunct.lower() in import_utilities.impera:
                                        replacement_word = random.choice(import_utilities.impera)
                                    elif word_nopunct.lower() in import_utilities.conjuncts:
                                        replacement_word = random.choice(import_utilities.conjuncts)
                                    elif word_nopunct.lower() in import_utilities.indef_prono:
                                        replacement_word = random.choice(import_utilities.indef_prono)
                                    elif word_nopunct.lower() in import_utilities.prepo:
                                        replacement_word = random.choice(import_utilities.prepo)
                                    elif word_nopunct.lower() in import_utilities.rel_prono:
                                        replacement_word = word  # leave alone
                                    elif word_nopunct.lower()[-2:] == "ly":
                                        if import_utilities.synset_creeley(word) is not None:
                                            replacement_word = import_utilities.strip_underscore(
                                                import_utilities.synset_creeley(word))
                                    else:
                                        if (len(hyp) < 2 and "like" not in word_nopunct
                                                and import_utilities.singularize(word_nopunct)
                                                    == import_utilities.singularize(replacement_word)
                                                and word_nopunct.lower() not in import_utilities.stopwords_ls):
                                            if (word not in RESERVOIR and quit_language < 0
                                                    and import_utilities.countPunctuation(word) < 1
                                                    and len(word_nopunct) > 3
                                                    and not word_nopunct.istitle()):
                                                # Adding only small words.
                                                if len(word) < 7:
                                                    RESERVOIR.append(word)
                                            replacement_word = random.choice(rap_mouth)  # RESERVOIR

                                    if quit_language > 1 and not word_nopunct.istitle():
                                        # Probably foreign language: make a word salad in English.
                                        replacement_word = random.choice(rap_mouth)  # RESERVOIR

                                    ###################################################
                                    # MOST REPLACEMENT occurs here...                 #
                                    ###################################################
                                    poem_ls = poem_replaced.split(' ')
                                    idx = poem_ls.index(word)
                                    if (len(word) > 3 and replacement_word.lstrip().rstrip()
                                            == word_nopunct.lstrip().rstrip()):
                                        RESERVOIR.sort(key=len)
                                        replacement_word = RESERVOIR[idx % len(RESERVOIR)]
                                        print idx, len(RESERVOIR), \
                                            "LAST CHANGE STOP: ", word, "~", replacement_word
                                    try:
                                        if (poem_ls[idx] == word and "****" not in word
                                                and "." != word and "\n" not in word):
                                            poem_ls[idx] = replacement_word
                                            poem_replaced = " ".join(poem_ls)
                                            # Store this word so that conjugation can be checked.
                                            previous_replacement_word = replacement_word
                                    except Exception, e:
                                        SKIP_bool = True
                                        continue

                        ###########################################################################
                        # testing Pattern.en as parser for conjugation and article replacement   #
                        # much more robust than my hand-coded hacks                              #
                        ###########################################################################
                        # Correct CONJUGATION of participle verbs with pattern.en.
                        parsed = parse(poem_replaced, tags=True)
                        pre_verbal = ["'m", "'s", "'re"]
                        for idx, p in enumerate(parsed.split(" ")):
                            tok = p.split("/")[0]
                            typ = p.split("/")[1]
                            if tok in pre_verbal:
                                next_word = parsed.split(" ")[idx + 1].split("/")
                                for ix, n in enumerate(next_word):
                                    next_word[ix] = re.sub(
                                        '(' + '|'.join(import_utilities.chars.keys()) + ')',
                                        import_utilities.replace_chars, n).encode('utf-8')
                                try:
                                    # If it's a verb that follows...
                                    if next_word[1][:2] == "VB":
                                        before_verb = " ".join(
                                            w for w in poem_replaced.split(" ")[:idx])
                                        after_verb = " ".join(
                                            w for w in poem_replaced.split(" ")[idx + 1:])
                                        new_verb = conjugate(next_word[0], tense=PARTICIPLE,
                                                             parse=True).encode('utf-8')
                                        poem_replaced = before_verb + " " + new_verb + " " + after_verb
                                except Exception, e:
                                    continue

                        # Correct ARTICLES. ("&nbsp;" is presumed; the entity
                        # was lost in extraction.)
                        for idx, word in enumerate(poem_replaced.split(" ")):
                            if len(word) > 0 and idx != 0 and "&nbsp;" not in word:
                                # A or AN
                                if (poem_replaced.split(" ")[idx - 1].lower() == "a"
                                        or poem_replaced.split(" ")[idx - 1].lower() == "an"):
                                    before_article = " ".join(
                                        w for w in poem_replaced.split(" ")[:idx - 1])
                                    after_article = " ".join(
                                        w for w in poem_replaced.split(" ")[idx + 1:])
                                    new_conj = referenced(word)
                                    # Capitalize.
                                    if poem_replaced.split(" ")[idx - 1].istitle():
                                        new_conj = new_conj.split(" ")[0].title() \
                                            + " " + new_conj.split(" ")[1]
                                    poem_replaced = before_article + " " + new_conj + " " + after_article

                        #########################
                        #   WRITE SINGLE POEM   #
                        #########################
                        if not SKIP_bool:
                            tmp_poem = ""
                            HTML_poem = ""
                            for line in poem_replaced.split("\n"):
                                HTML_poem += line + "<br>"
                            if len(response) > 0 and len(id.split("_")) > 1:
                                try:
                                    ALL_poems = "<br>[ A generated-poem based upon: <i>" + title \
                                        + "</i> by <b>" + author + "</b>]<br><br><i>" + new_title \
                                        + "</i><br> by <b>" + new_author + "</b><br>" + HTML_poem \
                                        + ALL_poems.split("</h2>")[1].replace("&nbsp;", " ")
                                    tmp_poem = "[A generated-poem based upon: '" + title + "' by " \
                                        + author + "]\n\n" + new_title + "\nby " + new_author \
                                        + "\n" + poem_replaced
                                    print "\n~~~\n\n" + tmp_poem
                                    txt_fn = id.split("_")[1] + "_POEMs.txt"
                                    WRITE_BIO_PATH = DATA_DIR + "generated/POEMS/POEMS_" \
                                        + datetime.datetime.now().strftime('%Y-%m-%d_%H') + "/"
                                    if not os.path.exists(WRITE_BIO_PATH):
                                        os.makedirs(WRITE_BIO_PATH)
                                    txt_fn_path = WRITE_BIO_PATH + txt_fn
                                    f_txt = open(txt_fn_path, 'w')
                                    f_txt.write(tmp_poem)
                                    f_txt.close()

                                    # Write them all... wasteful... but useful if
                                    # the run is interrupted.
                                    ALL_poems = ALL_poems_intro + ALL_poems.replace("&nbsp;", " ")
                                    ALL_poems = ALL_poems.replace(
                                        "$$datetime$$",
                                        datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                                    ALL_poems = ALL_poems.replace("$$cnt$$", str(cnt))
                                    print "cnt", cnt
                                    ALL_poems = ALL_poems.replace(
                                        "$$gentime$$", str(time.time() - start_time))
                                    # ALL POEMS
                                    txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H') \
                                        + "_poetryFoundation_generatedPOEMS_CREELEYstyle_" \
                                        + type_of_run + ".html"
                                    txt_fn_path = DATA_DIR + "generated/POEMS/" + txt_fn
                                    f_txt = open(txt_fn_path, 'w')
                                    f_txt.write(ALL_poems + "</html>")  # fixed: "</hmtl>" typo
                                    f_txt.close()
                                    print "\nTXT file created at:", txt_fn_path
                                except Exception, e:
                                    print "At the final LOOP", e
                                    continue
                            else:
                                print "~! EMPTY response:", author
                else:
                    cnt = cnt - 1
def _find_verbs(self, word):
    word_bigrams = [(a[0], b[0]) for a, b in self.word_tag_pairs
                    if a[0] == word.name
                    and a[1] == 'NOUN'
                    and b[1] == 'VERB'
                    and en.conjugate(b[0], "inf") not in ('be', 'have')]
    return self.__get_best_collocations(word, word_bigrams)
def change_tense_fixed(text, tense):
    # Hand-coded overrides for irregulars that conjugate() gets wrong.
    if text == 'leave' and tense == PAST:
        return 'left'
    if text == 'quit' and tense == PAST:
        return 'quit'
    return conjugate(text, tense)
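# Usage sketch:
#
#   change_tense_fixed('leave', PAST)  # -> 'left' (override)
#   change_tense_fixed('walk', PAST)   # -> 'walked' (falls through to conjugate)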