def create_claims(self):
    independent = '{0}. {1} for {2}, comprising:'
    dependent = '{0}. {1} of claim {2}, wherein said {3} comprises {4}.'
    self.claims = []
    claim_number = 0
    for prefix in self.prefixes:
        claim_number += 1
        claim = independent.format(
            claim_number, referenced(prefix).capitalize(), self.partial_title)
        terms = random.sample(list(self.artifacts),
                              random.randint(2, min(len(self.artifacts), 5)))
        for term in terms[:-1]:
            claim += '\n\t' + referenced(term) + '; '
        claim += 'and \n\t' + referenced(terms[-1]) + '.'
        self.claims.append(claim)
        independent_claim_number = claim_number
        for term in terms:
            claim_number += 1
            claim = dependent.format(
                claim_number,
                referenced(prefix, article=DEFINITE).capitalize(),
                independent_claim_number, term,
                random.choice(self.unformatted_illustrations))
            self.claims.append(claim)
def processArt(t):
    # lis=nltk.pos_tag(nltk.word_tokenize(s))
    # lis=x.split(" ")
    l = []
    cnt = 0
    for ele in t:
        cnt = cnt + 1
        # y=ele[1]
        # word=ele[0]
        # if ele in Articles:
        #     if superlative(wnl.lemmatize(t[cnt]))==t[cnt]:
        #         l=l+["the"]
        if ele == "a":
            # compare only the article that referenced() picks for the next word
            if referenced(t[cnt]).split()[0] == "a":
                l = l + [joinlist(["a", "the"])]
            else:
                l = l + [joinlist(["an", "the"])]
        elif ele == "an":
            if referenced(t[cnt]).split()[0] == "an":
                l = l + [joinlist(["an", "the"])]
            else:
                l = l + [joinlist(["a", "the"])]
        else:
            l = l + [ele]
    return " ".join(l)
def testBasic():
    from pattern.en import referenced
    print referenced('hour')

    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')
    print lemma('purring')
    print conjugate('purred', '3sg')  # he / she / it
def _on_match_rewrite(matcher, doc, i, matches, pattern):
    match_id, start, end = matches[i]
    # get meta information for this on-match function
    rule_pattern = PatternMeta(
        before=self.normalize_pattern(pattern.before),
        after=self.normalize_pattern(pattern.after)
    )
    prev_text, after_text = doc[:start].text.strip(), doc[end:].text.strip()
    # keep all the orth in the pattern.
    between_tokens = [get_str_from_pattern(r) for r in rule_pattern.after]
    # fill in all the other None
    for idx, _ in enumerate(between_tokens):
        # if between_tokens[idx]:  # already filled in
        #     continue
        # check what OP it is.
        for op in self.ops:
            # idx is the idx in the toIdxes
            # make sure this is in the right count of words.
            # Should be because we are using one word a time
            if op.toIdxes[0] <= idx and op.toIdxes[1] > idx and \
                    op.fromIdxes[1] - op.fromIdxes[0] == op.toIdxes[1] - op.toIdxes[0]:
                # before, it is from the start to the fromIdx 0
                # offset of the idx w.r.t op.toIdxes[0], then add the offset to the fromIdx
                before_idx = idx - op.toIdxes[0] + op.fromIdxes[0]
                before_doc_idx = before_idx + start
                # then the after idx.
                after_idx = idx  # + op.toIdxes[0]
                if before_idx < 0 or before_idx >= op.fromIdxes[1] or \
                        after_idx < 0 or after_idx >= op.toIdxes[1] or \
                        before_doc_idx < 0 or before_doc_idx >= len(doc):
                    continue
                between_tokens[idx] = match_super(
                    doc[before_doc_idx].text,
                    change_matched_token_form(  # form change
                        a_token=doc[before_doc_idx],
                        a_pattern=rule_pattern.before[before_idx],
                        b_pattern=rule_pattern.after[after_idx]))
                break
    # fill in all the other None
    for idx, _ in enumerate(between_tokens):
        if between_tokens[idx]:  # already filled in
            continue
        # add appropriate DET
        if 'POS' in rule_pattern.after[idx] and rule_pattern.after[idx]['POS'] == 'DET':
            if idx < len(between_tokens) - 1 and between_tokens[idx + 1]:
                between_tokens[idx] = referenced(between_tokens[idx + 1]).split()[0]
            elif (start + idx - 1 < len(doc)) and \
                    (start + idx - 1 >= 0):
                between_tokens[idx] = referenced(doc[start + idx - 1].text).split()[0]  # noqa: E501
    if not None in between_tokens:
        generated_text = ' '.join([prev_text] + between_tokens + [after_text]).strip()
        return (match_id, generated_text)
    return None
def p12():
    p = "{noun1} stood in the middle of the {noun2}, surrounded by {noun3} of {adjective} {nouns}.".format(
        noun1 = referenced(random.choice(ns)),
        noun2 = random.choice(ns),
        verbed = random.choice(verbed),
        noun3 = referenced(random.choice(ns)),
        adjective = random.choice(adjs),
        nouns = pluralize(random.choice(ns))
    )
    if random.random() < 0.25:
        p = look_around_you(p)
    return capitalize(p)
def p16():
    p = "the {noun1} was {adjective1} {noun2}, divided into {adjective2} {nouns1} by {quant} of {adjective3} {color} {nouns2}.".format(
        noun1 = random.choice(ns),
        adjective1 = referenced(random.choice(adjs)),
        noun2 = random.choice(ns),
        adjective2 = random.choice(adjs),
        nouns1 = pluralize(random.choice(ns)),
        quant = referenced(random.choice(quants)),
        adjective3 = random.choice(ns),
        color = random.choice(colors),
        nouns2 = pluralize(random.choice(ns))
    )
    return capitalize(p)
def p5():
    p = "the {noun1} was surrounded by {quant} of {noun2} {nouns}, {adjective} as {noun3}.".format(
        noun1 = random.choice(ns),
        quant = referenced(random.choice(quants)),
        noun2 = random.choice(ns),
        nouns = pluralize(random.choice(ns)),
        adjective = random.choice(adjs),
        noun3 = referenced(random.choice(ns))
    )
    if random.random() > 0.6:
        p = random.choice([look_around_you(p), get_lost(p)])
    elif random.random() > 0.8:
        p = suddenly(p)
    return capitalize(p)
def indef(word):
    if word[0] in SYMB_FOR_INDEF_AN:
        return 'an {0}'.format(word)
    else:
        return referenced(word) if word not in REFERENCED_EXCEPTIONS \
            else '{0} {1}'.format(REFERENCED_EXCEPTIONS[word], word)
def prefix(self):
    prefixes = ["system", "method", "apparatus", "device"]
    self.prefixes = random.sample(prefixes, 2)
    title = self.prefixes[0] + " and " + self.prefixes[1] + " for "
    if random.random() < .2:
        title = "web-based " + title
    return referenced(title)
def make_noun_string(np, plural=False):
    # random chance of removing modifier
    # if random.random() < 0.5:
    #     np[0] == ''
    # common mass nouns
    if np[1] in ['data', 'information', 'children', 'people', 'stuff', 'equipment']:
        return ' '.join(np).strip()
    elif any(np[1].lower().startswith(x) for x in ('every', 'any', 'some')) or np[1] in ('nothing', 'nobody'):
        return np[1]
    quantifiers = ['many', 'few', 'several', 'various', 'multiple', 'fewer', 'more']
    if np[0] in quantifiers:
        return np[0] + ' ' + pluralize(np[1])
    else:
        die_roll = random.random()
        if die_roll < 0.15 or plural:
            return ' '.join((np[0], pluralize(np[1]))).strip()
        elif die_roll < 0.25:
            return random.choice(('his', 'her', 'their', 'your')) + ' ' + ' '.join(np).strip()
        elif random.random() < 0.45:
            return referenced(' '.join(np).strip())
        else:
            return 'the ' + ' '.join(np).strip()
def get_random_word(t, ref=False):
    """Return a random word from a set filtering on lexname category if necessary"""
    # If there are entries in the lexnames list for the given POS tag, limit results to that,
    # otherwise just return a random word for that POS
    word = None
    if len(POS[t]['lexnames']):
        lexname = ''
        while lexname not in POS[t]['lexnames']:
            word = random.choice(POS[t]['words'])[0]
            synset = wordnet.synsets(get_singular(word), pos=t)
            if synset:
                lexname = synset[0].lexname
    else:
        word = random.choice(POS[t]['words'])[0]
    # If required, prefix with an article
    if ref:
        word = referenced(word)
    return word.lower()
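# A minimal usage sketch for get_random_word() above. The layout of the POS
# table is inferred from the lookups in the function and the data here is
# invented, not from the source; with an empty "lexnames" list the WordNet
# filtering branch is skipped entirely, so only the referenced() behaviour
# is exercised.
POS = {
    'n': {
        'words': [('owl', 'NN'), ('hour', 'NN'), ('university', 'NN')],
        'lexnames': [],  # no lexname filtering in this sketch
    },
}
print(get_random_word('n'))            # e.g. "hour"
print(get_random_word('n', ref=True))  # e.g. "an hour"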
def ct(word, num, use_a=False):
    ''' Return a string counting + pluralizing, if necessary, the word '''
    # Switch the numeral 1 to 'a' if the use_a flag is set
    if num == 1 and use_a:
        # Auto-choose "a" or "an" based on word
        return referenced(word)
    return '{0} {1}'.format(num, pl(word, num))
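# Usage sketch for ct() above (not from the source). The pluralizing helper
# "pl" is not shown in this snippet, so only the referenced() branch is
# exercised here:
print(ct('hour', 1, use_a=True))  # -> "an hour"
print(ct('bear', 1, use_a=True))  # -> "a bear"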
def opening():
    p = "it was {adjective} and {adjectively} {noun}.".format(
        adjective=referenced(random.choice(adjs)),
        adjectively=random.choice(adjeys),
        noun=random.choice(ns)
    )
    return capitalize(p)
def describe(synset):
    names = synset.lemma_names
    name = referenced(names[0])
    definition = synset.definition.strip()
    string = "{0} is {1}".format(name, definition)
    if len(names) > 1:
        string += ', also known as {0}'.format(enum_or(names[1:]))
    return sentence(string)
def specify(synset, hyponyms):
    name = referenced(synset.lemma_names[0])
    string = name
    hyp_names = [s.lemma_names[0] for s in hyponyms]
    if len(hyp_names) == 1:
        string += ' can, more specifically, be a {0}'.format(hyp_names[0])
    else:
        string += ' is either {0}. '.format(enum_or(hyp_names))
    return sentence(string)
def p3():
    p = "{pronoun} seated in {noun}, surrounded by {nouns1} and {nouns2}.".format(
        pronoun = capitalize(PRONOUN) + " " + VERB,
        noun = referenced(random.choice(ns)),
        nouns1 = pluralize(random.choice(ns)),
        nouns2 = pluralize(random.choice(ns))
    )
    return capitalize(p)
def add_sentence(noun, adjective, nutsness=10):
    '''
    Create a new sentence. Nutsness will define the chance of generating
    strange additions with pattern's drivel(). This is awesome.

    Input: String noun, String adjective, integer nutsness
    Output: String sentence
    '''
    nuts = 10 - nutsness
    n = noun.split()
    if randint(0, nuts) != 0:
        # return a ridiculous sentence (space added so the adjective and the
        # drivel phrase do not run together)
        sentence = '. It is {}'.format(referenced(adjective + ' ' + drivel(n[-1])))
    else:
        # return a boring sentence; referenced() already supplies the article
        sentence = '. This is {} place'.format(referenced(adjective))
    return sentence
def create_abstract(self):
    artifacts = search.hypernym_combo(self.source_text, 'artifact', "JJ NN|NNS")
    # artifacts += search.hypernym_combo(self.source_text, 'material', "JJ NN|NNS")
    artifacts = set(artifacts)
    self.artifacts = artifacts
    words = []
    words = [referenced(w) for w in artifacts]
    self.abstract = self.title + ". "
    self.abstract += "The devices comprises "
    self.abstract += ", ".join(words)
def test_indefinite_article(self):
    # Assert "a" or "an".
    for article, word in (("an", "hour"), ("an", "FBI"), ("a", "bear"),
                          ("a", "one-liner"), ("a", "European"),
                          ("a", "university"), ("a", "uterus"),
                          ("an", "owl"), ("an", "yclept"), ("a", "year")):
        self.assertEqual(en.inflect.indefinite_article(word), article)
    self.assertEqual(en.article("heir", function=en.INDEFINITE), "an")
    self.assertEqual(en.referenced("ewe"), "a ewe")
    print "pattern.en.article()"
def clean_text(self, words):
    new_words = words.split(' ')
    doc = self.nlp(words)
    first_word_POS = doc[0].pos_
    if first_word_POS == 'VERB':
        new_words[0] = conjugate(new_words[0], tense=PARTICIPLE)
    if first_word_POS == 'NOUN' or first_word_POS == 'ADJ':
        # only prepend an article if the phrase does not already start with one
        if new_words[0] != 'a' and new_words[0] != 'an':
            new_words[0] = referenced(new_words[0])
    elif first_word_POS == 'NUM' and len(new_words) > 1:
        new_words[1] = pluralize(new_words[1])
    return ' '.join(new_words)
def processArt(t):
    # lis=nltk.pos_tag(nltk.word_tokenize(s))
    # lis=x.split(" ")
    l = []
    cnt = 0
    for ele in t:
        cnt = cnt + 1
        if ele == "a":
            # compare the article referenced() picks for the following word
            # (the original used an undefined "count" and compared inside the call)
            if referenced(t[cnt]).split()[0] == "a":
                l = l + [joinlist(["a", "the"])]
            else:
                l = l + [joinlist(["an", "the"])]
        elif ele == "an":
            if referenced(t[cnt]).split()[0] == "an":
                l = l + [joinlist(["an", "the"])]
            else:
                l = l + [joinlist(["a", "the"])]
        else:
            l = l + [ele]
    return " ".join(l)
def formats(self, phrase):
    doc = self.nlp(phrase)
    first_word_POS = doc[0].pos_
    tokens = phrase.split(' ')
    new_tokens = tokens.copy()
    new_phrases = []
    # original
    new_phrases.append(' '.join(new_tokens))
    # with indefinite article
    if first_word_POS == 'NOUN' or first_word_POS == 'ADJ':
        new_tokens[0] = referenced(tokens[0])
        new_phrases.append(' '.join(new_tokens))
    # with definite article
    if first_word_POS == 'NOUN' or first_word_POS == 'ADJ':
        new_tokens[0] = "the " + tokens[0]
        new_phrases.append(' '.join(new_tokens))
    # as gerund
    if first_word_POS == 'VERB':
        new_tokens[0] = conjugate(tokens[0], tense=PARTICIPLE)
        new_phrases.append(' '.join(new_tokens))
        if len(tokens) > 1:
            if tokens[1] == 'to' and len(tokens) > 2:
                new_tokens[2] = referenced(tokens[2])
            else:
                new_tokens[1] = referenced(tokens[1])
            new_phrases.append(' '.join(new_tokens))
            new_tokens[0] = tokens[0]
            new_phrases.append(' '.join(new_tokens))
    # account for numbers
    if first_word_POS == 'NUM' and len(tokens) > 1:
        new_tokens[1] = pluralize(tokens[1])
        new_phrases.append(' '.join(new_tokens))
    return new_phrases
def p6():
    p = "the {adjective1} sound of {noun1} pushed itself into the room, disturbing the silence like {adjective2} {noun2}.".format(
        adjective1 = random.choice(adjs),
        noun1 = random.choice(ns),
        adjective2 = referenced(random.choice(adjs)),
        noun2 = random.choice(ns)
    )
    if random.random() < 0.25:
        p = meanwhiler(p)
    elif random.random() < 0.4:
        p = random.choice([look_around_you(p), get_lost(p)])
    elif random.random() < 0.6:
        p = suddenly(p)
    return capitalize(p)
def p4():
    p = "the {noun1} was {verbing} {closer}, like {quant} of {adjective} {noun2}.".format(
        noun1 = random.choice(ns),
        verbing = random.choice(["creeping", "crawling", "sneaking", "slithering"]),
        closer = random.choice(["closer", "nearer"]),
        quant = referenced(random.choice(quants)),
        adjective = random.choice(adjs),
        noun2 = random.choice(ns)
    )
    if random.random() < 0.25:
        p = meanwhiler(p)
    elif random.random() < 0.4:
        p = random.choice([look_around_you(p), get_lost(p)])
    return capitalize(p)
def get_flowers(self):
    """Returns a textual representation of carried flowers"""
    temp = ""
    if self.FLOWERS:
        temp += "I left with a beautiful bouquet of flowers that contained: "
        for f in self.FLOWERS:
            if f == self.FLOWERS[-1] and len(self.FLOWERS) > 1:
                temp += ", and "
            elif f != self.FLOWERS[0]:
                temp += ", "
            temp += referenced(f['color'] + " " + f['flower'])
    return temp + ".\n"
def optionByproducts(opt):
    """Given a concept node that represents a verb phrase, figure out what new
    items will be established in the scene after carrying out that phrase.
    Print each item."""
    dirObj = findDirectObject(opt['label'])
    new_things = [dirObj] if dirObj else []
    effects = query(opt['term'], 'Causes')
    created = query(opt['term'], 'CreatedBy', reverse=True)
    if effects and conceptOptions(effects['term']):
        new_things.append(effects['term'])
    if created and conceptOptions(created['term']):
        new_things.append(created['term'])
    new_things = [attemptSingularization(t) for t in new_things]
    for t in new_things:
        print("There is now {} in the scene.".format(
            en.referenced(termToReadable(t))))
    return new_things
def generate_phrase(self):
    adj = choice([a for a in self.blackboard.pool.comparisons
                  if len(self.blackboard.pool.comparisons[a]) > 0])
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    noun = choice(list(self.blackboard.pool.comparisons[adj]))
    noun.name = en.singularize(noun.name)
    article = en.referenced(noun.name).split(" ")[0]
    replace_words = {'adj': adj, 'n': noun, 'det': article}
    for pos in replace_words:
        while pos in phrase:
            try:
                phrase = self.replace_pos(pos, replace_words[pos], phrase)
            except:
                return
    for w in phrase:
        if not isinstance(w, Word):
            phrase[phrase.index(w)] = Word(w)
    return phrase
def do_animal(self, i, j):
    """Process finding an animal"""
    # Get a random animal and give it a name
    animal = random.choice(self.JSON['animals'])
    name = random.choice(self.JSON['names'])
    # Print that info
    self.TEMP += "There was " + referenced(animal) + " there. "
    self.TEMP += "I named it " + name + "."
    # Put a square on the map to denote finding an animal here
    self.IMAGE.filledRectangle((i * 15 + 4, j * 15 + 4),
                               (i * 15 + 11, j * 15 + 10),
                               self.COLORS['red'])
    # Did the animal follow the narrator?
    if random.randrange(100) < 10:
        self.ANIMALS.append({'name': name, 'animal': animal})
        self.get_animal_concepts(animal)
        self.TEMP += " It started following me."
    self.TEMP += "\n"
    self.THEN = False
def extractFeaturesAndWriteBio(READ_PATH,file_type): global ALL_poems,bio,cnt for subdir, dirs, files in os.walk(READ_PATH): for file in files: num_of_files = len(files)-1 # deduct the DS_store #print (num_of_files,'readDirectory',READ_PATH) if file_type in file and 'readme' not in file: # ID id=file.split(".")[0] #print "\nID:",id.split("_")[1] filenames.append(id) cnt+=1 # print('') # print('') # print('OPENED:',id) # print('') # print('') poem_replaced = "" replacement_word = "" previous_replacement_word = "" author="" titles="" title="" new_title="" replaced_ls =[] new_titles_ls = [] quit_language=0 oscillator=0 word_cnt=0 # if EXCEPTION is raised... do not add to html SKIP_bool=False ########################## # Load POEM TEXT FILE # ########################## ## # PAUSE ## #time.sleep(5) txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt" #print "txt_fn_path:",txt_fn_path if os.path.isfile(txt_fn_path) and cnt>0: txt_data=open(txt_fn_path).read() # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html # txt_data.decode('ISO-8859-2') .decode('utf-8') # unicode(txt_data) author=txt_data.split("****!****")[0].strip(' \t\n\r') title=txt_data.split("****!****")[1].strip(' \t\n\r') bio=txt_data.split("****!****")[2]#.strip(' \t\n\r') ###### CLEAN BIO bio.replace("\t","	") bio.replace("\n"," <br>") bio.replace("\r"," <br>") poem_replaced=bio #print poem_replaced ############################### # REPLACE AUTHOR NAME in poem ############################## author_ln=author.split(" ")[-1].lstrip() author_fn=author.split(" ")[:-1] author = " ".join(n for n in author_fn)+author_ln # #poem_replaced = poem_replaced.replace(author_ln,"Jhave") ####################### # replace BOOK TITLES ####################### #print "TITLES"] new_title = getNewTitle("title").encode('utf-8') ####################### # fake AUTHOR ####################### new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2] #print "new AUTHOR",new_author ############################ # replace years with another ############################ for w1 in poem_replaced.split("("): for w2 in w1.split(")"): if w2 is not None and w2.isdigit(): new_num = random.randint(int(w2)-5,int(w2)+5) #print "REPLACING #:",w2,new_num poem_replaced = poem_replaced.replace(w2,str(new_num)) replaced_ls.append(new_num) ################# # Load JSON # ################# response = loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt") if response != "failed": if response.get('entities') is not None: for idx,entity in enumerate(response['entities']): #print idx ce = entity['text'].replace("0xc2"," ") ce = ce.replace("0xe2","'") ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce) ce = ce.encode('utf-8') try: content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace') except UnicodeDecodeError: "AAAARGGGGHHH!!!!" 
if content in poem_replaced: ################################################ # Replace similar entities from other JSON # ################################################ replacement_entity = findSimilarEntityinRandomJSON(content,entity['type']) cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity) poem_replaced = poem_replaced.replace(content,replacement_entity) replaced_ls.append(replacement_entity) ########################## # POS REPLACMENT # ########################## token_tuples = nltk.word_tokenize(poem_replaced) tt = nltk.pos_tag(token_tuples) ################# # ADJECTIVES # ################# for i in tt: if "/i" not in i[0] and len(i[0])>3 and i[0] != "died": origw = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0]) origw =import_utilities.strip_punctuation(origw) if i[1]=='JJ' : JJr = random.choice(JJ) # # JJr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr) # JJr = import_utilities.strip_punctuation(JJr) JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.lstrip().lstrip()) if i[0].istitle(): JJr = JJr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1) replaced_ls.append(JJr) if i[1]=='RB': RBr = random.choice(RB) RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.lstrip().lstrip()) if i[0].istitle(): RBr = RBr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', RBr, poem_replaced,1) replaced_ls.append(RBr) ######################## # IS IT ENGLISH? # ######################## for line in poem_replaced.split('\n\r'): if len(line)>0 : if "english" not in import_utilities.get_language(line): quit_language+=1 #print "NOT english:",quit_language,line else: quit_language-=1 ######################### # SYNSET REPLACE # ######################### for idx,word in enumerate(poem_replaced.split(' ')): if "<br>" not in word and "	" not in word and len(word)>0: ######################### # PRONOUN ' VERB # ######################### if len(word.split("'"))>1: if word.split("'")[0] in personal_pronouns: replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' ' poem_replaced.replace(word,replacement_word) #print "word,",word,"replacement_word:",replacement_word #################################################### # Replacement of OTHERs # #################################################### elif not word.lower().strip(" \n\t\r") in stopwords.words('english'): # take off leading brackets, commas etc... word_punct_nopunct = import_utilities.strip_punctuation_bool(word) word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r") word_punct = word_punct_nopunct['punct'] punct_bool = word_punct_nopunct['punct_bool'] ####################################################### # MAIN EXCHANGE PROCESS CALL >>>>>>> GET THE SYNSET # ####################################################### if word_nopunct[-4:].lower()=="here": similarterm=random.choice(import_utilities.heres) else: #print "WORD:",word_nopunct if len(word_nopunct)>3: oscillator = oscillator+1 ############################################ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # STYLE SWITCH..... should in future use POS # ... i.e. if noun & oscillator%3, do... 
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ############################################ # synset similarterm = import_utilities.synset_creeley(word_nopunct) #print "synset", similarterm if similarterm is not None and similarterm == word_nopunct and len(word_nopunct)>5: RESERVOIR.sort(key=len) similarterm= RESERVOIR[idx%len(RESERVOIR)] #print idx,len(RESERVOIR),similarterm,word_nopunct,"PRE>>>>>>>>LAST CHANGE STOP: ", word, "~",similarterm ############################################ # manually get rid of some terrible choices ############################################ naw_terms=["mind","lonely"] if similarterm == "ilk": ##print "like" similarterm = "like" if similarterm == "Nox": ##print "like" similarterm = "nite" if similarterm == "ope": ##print "doth" similarterm = "does" if similarterm == "information technology": ##print "doth" similarterm = "it" if similarterm == "velleity": ##print "doth" similarterm = "want" if similarterm == "Crataegus laevigata": ##print "doth" similarterm = "may" if similarterm == "eff": ##print "doth" similarterm = "know" if similarterm == "naw": ##print "doth" similarterm = "mind" ####################################### # abbreviations for f*****g states! # ####################################### if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word: tmp = similarterm if word_nopunct == "oh": similarterm = random.choice(import_utilities.exclaims) else: similarterm = random.choice(RESERVOIR) #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line ############## # hyphenated # ############## hyp =word.split("-") #print word,len(hyp) if len(hyp) >1: similarterm="" for w in hyp: if len(w) > 2: if import_utilities.synset_creeley(w) is not None: similarterm += import_utilities.synset_creeley(w)+"-" else: similarterm += w+"-" similarterm = import_utilities.strip_underscore(similarterm[:-1]) #print "hyphenated:",word,"replaced by: "+similarterm # ######################################################### # # is it a TRUNCATED VERB slang as in singin or wishin # # ######################################################### # if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]: # similarterm = import_utilities.synset_creeley(word_nopunct+'g') # ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm # interim = import_utilities.lemma(similarterm) # ## #print interim # similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] # # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1] ################# # SWEAR WORD # ################# ##print "at the garden of if:", word if word_nopunct in import_utilities.curses: similarterm = random.choice(import_utilities.curses) ##print "SWEAR WORD word: '"+word+"'",similarterm #print "SIMILAR:",similarterm if similarterm is not None: if len(hyp) >1: replacement_word = similarterm else: replacement_word = word.replace(word_nopunct, similarterm) replacement_word = import_utilities.strip_underscore(replacement_word) replacement_word = import_utilities.replaceNumbers(replacement_word) ######################### # RESERVOIR_OF_WEIRDNESS # ######################### if word_nopunct.lower() in import_utilities.impera: replacement_word=random.choice(import_utilities.impera) #print word,"IMPERA:",replacement_word elif word_nopunct.lower() in import_utilities.conjuncts: 
replacement_word=random.choice(import_utilities.conjuncts) #print word," CONJUNCTION replaced with",replacement_word elif word_nopunct.lower() in import_utilities.indef_prono: replacement_word=random.choice(import_utilities.indef_prono) #print word," INDEF_prono replaced with",replacement_word elif word_nopunct.lower() in import_utilities.prepo: replacement_word=random.choice(import_utilities.prepo) #print word," prepo replaced with",replacement_word elif word_nopunct.lower() in import_utilities.rel_prono: replacement_word=word #print word," rel_prono LEAVE alone: ",replacement_word elif word_nopunct.lower()[-2:] =="ly": if import_utilities.synset_creeley(word) is not None: replacement_word=import_utilities.strip_underscore(import_utilities.synset_creeley(word))#(word[:-2]) #print word," ADVERB: ",replacement_word # if replacement_word[-2:] !="ly": # replacement_word +="ly" else: if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) == import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls: if word not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): #print "ADDING",word,"to reservoir" ############################ # ADDING ONLY SMALL WORDS ############################ if len(word)<7: RESERVOIR.append(word) replacement_word = random.choice(rap_mouth)# RESERVOIR) #print word_nopunct,"replaced from reservoir with", replacement_word # print "'"+word_nopunct+"' vs RESERVOIR replacement_word:",replacement_word #," new_line:",new_line if quit_language>1 and not word_nopunct.istitle(): #print quit_language, "Probably foreign language: make a word salad in english" replacement_word = random.choice(rap_mouth)#RESERVOIR) #print word_nopunct,"OTHER replaced from reservoir with", replacement_word ################################################### # MOST REPLACEMENT occurs here... # ################################################### poem_ls = poem_replaced.split(' ') idx = poem_ls.index(word) # print idx,",", poem_ls[idx],",", word ,",",replacement_word #print word ," --- ",previous_replacement_word,replacement_word if len(word)>3 and replacement_word.lstrip().rstrip() == word_nopunct.lstrip().rstrip(): # try alchemy? # a RESERVOIR.sort(key=len) replacement_word = RESERVOIR[idx%len(RESERVOIR)] #print idx,len(RESERVOIR),"LAST CHANGE STOP: ", word, "~",replacement_word try: if poem_ls[idx]==word and "****" not in word and "." 
!= word and "\n" not in word: poem_ls[idx]=replacement_word#.encode('utf-8') poem_replaced = " ".join(poem_ls) # store this word so that conjugation can be checked previous_replacement_word=replacement_word except Exception, e: #print "PENULTIMATE SKIP_bool replace FAIL",e SKIP_bool=True continue ########################################################################### # testing Pattern.en as parser for conjugation and article replacement # # much more robust than my hand-coded hacks # ########################################################################### # correct CONJUGATion of paticiple verbs with pattern.en parsed = parse(poem_replaced,tags = True) pre_verbal = ["'m","'s","'re"] for idx,p in enumerate(parsed.split(" ")): tok =p.split("/")[0] typ=p.split("/")[1] #print idx,tok,typ if tok in pre_verbal: #print "pre_verbal:",tok next_word= parsed.split(" ")[idx+1].split("/") # try try try for ix,n in enumerate(next_word): next_word[ix] = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, n).encode('utf-8') try: #print next_word,next_word[0],next_word[1][:2] # if it's a verb that follows if next_word[1][:2] =="VB": before_verb = " ".join(w for w in poem_replaced.split(" ")[:idx])#.encode('utf-8') after_verb = " ".join(w for w in poem_replaced.split(" ")[idx+1:])#.encode('utf-8') new_verb = conjugate(next_word[0], tense=PARTICIPLE, parse=True).encode('utf-8') # insert new #print "CONJUGATION needed, changing:",poem_replaced.split(" ")[idx],"to",parsed.split(" ")[idx],poem_replaced.split(" ")[idx-1]+" "+new_verb poem_replaced = before_verb+" "+new_verb+" "+after_verb except Exception, e: # print "INside parsed COnjugation loop",e continue # correct ARTICLES for idx,word in enumerate(poem_replaced.split(" ")): if len(word)>0 and idx != 0 and " " not in word: # A or AN if poem_replaced.split(" ")[idx-1].lower() =="a" or poem_replaced.split(" ")[idx-1].lower() =="an": #print word,"---",article(word)+" "+word before_article = " ".join(w for w in poem_replaced.split(" ")[:idx-1]) after_article = " ".join(w for w in poem_replaced.split(" ")[idx+1:]) new_conj = referenced(word) # capitalize if poem_replaced.split(" ")[idx-1].istitle(): new_conj = new_conj.split(" ")[0].title()+" "+new_conj.split(" ")[1] poem_replaced = before_article+" "+new_conj+" "+after_article ######################### # WRITE SINGLE POEM # ######################### if not SKIP_bool: tmp_poem="" # poem_replaced.replace("\t","	") # poem_replaced.replace("\n"," <br>") # poem_replaced.replace("\r"," <br>") HTML_poem="" for line in poem_replaced.split("\n"): #print "LINE", line HTML_poem += line+"<br>" if len(response) >0 and len(id.split("_"))>1: # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem # try: ALL_poems = "<br>[ A generated-poem based upon: <i>"+ title +"</i> by <b>"+ author+"</b>]<br><br><i>"+new_title+"</i><br> by <b>"+ new_author +"</b><br>"+HTML_poem+ALL_poems.split("</h2>")[1].replace(" "," ") tmp_poem= "[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+new_title+ "\nby "+new_author+"\n"+poem_replaced print "\n~~~\n" #+tmp_poem # SLOW TYPEWRITER PRESENTATION for line in tmp_poem: for c in line: time.sleep(0.04) sys.stdout.write(c)#(c.encode("utf8")) sys.stdout.flush() # #sys.stdout.write("\n") txt_fn = id.split("_")[1]+"_POEMs.txt" WRITE_BIO_PATH = 
DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/" if not os.path.exists(WRITE_BIO_PATH): os.makedirs(WRITE_BIO_PATH) txt_fn_path = WRITE_BIO_PATH+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write(tmp_poem)#.encode('utf-8')) f_txt.close(); #print "\nTXT file created at:",txt_fn_path ####### # write them all.... wasteful... but useful if run is interrupted.... ########### # if cnt==1: # ALL_poems = ALL_poems_intro+ALL_poems # else: ALL_poems = ALL_poems_intro+ALL_poems.replace(" "," ") ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M')) ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt)) #print "cnt",cnt ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time)) # ALL POEMS txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_CREELEYstyle_"+type_of_run+".html" txt_fn_path = DATA_DIR+"generated/POEMS/"+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write(ALL_poems+"</hmtl>") f_txt.close(); #print "\nTXT file created at:",txt_fn_path # except Exception, e: # print "At the final LOOP",e # #continue # pass else: pass #print "~! EMPTY response:", author else: cnt = cnt-1
# The en module has a range of tools for word inflection:
# guessing the indefinite article of a word (a/an?),
# pluralization and singularization, comparative and superlative adjectives, verb conjugation.

# INDEFINITE ARTICLE
# ------------------

# The article() function returns the indefinite article (a/an) for a given noun.
# The definite article is always "the". The plural indefinite is "some".
print(article("bear") + " bear")
print("")

# The referenced() function returns a string with article() prepended to the given word.
# The referenced() function is non-trivial, as demonstrated with the exception words below:
for word in ["hour", "one-liner", "European", "university", "owl", "yclept", "year"]:
    print(referenced(word))
print("")

# PLURALIZATION
# -------------

# The pluralize() function returns the plural form of a singular noun (or adjective).
# The algorithm is robust and handles about 98% of exceptions correctly:
for word in ["part-of-speech", "child", "dog's", "wolf", "bear", "kitchen knife"]:
    print(pluralize(word))
print(pluralize("octopus", classical=True))
print(pluralize("matrix", classical=True))
print(pluralize("matrix", classical=False))
print(pluralize("my", pos=ADJECTIVE))
print("")

# SINGULARIZATION
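# ---------------
# A plausible continuation of the SINGULARIZATION header above, mirroring the
# pluralization examples; the exact wording of the original source may differ.
# The singularize() function returns the singular form of a plural noun:
for word in ["parts-of-speech", "children", "dogs'", "wolves", "bears", "kitchen knives"]:
    print(singularize(word))
print(singularize("our", pos=ADJECTIVE))
print("")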
from pattern.en import referenced, article
from pattern.en import pluralize, singularize
from pattern.en import conjugate, lemma, lexeme, tenses, PAST, PL
from pattern.en import quantify
from pattern.en import ngrams
from pattern.en import parse, tag, pprint
from pattern.en import sentiment, polarity, subjectivity, modality
from pattern.en import Sentence

# Indefinite article
print article('university')
print article('hour')
print referenced('university')
print referenced('hour')

# Pluralization / singularization
print pluralize('child')
print singularize('wolves')

# Conjugation
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred')

print 'Quantification'
def indef(lemma):
    if lemma in ('someone', 'something'):
        return lemma
    return referenced(lemma, article=INDEFINITE)
def do_animal_conversation(self, i, j):
    """Make two animals talk to one another"""
    # Pick two animals and make sure they're not the same one
    to = random.choice(self.ANIMALS)
    fro = random.choice(self.ANIMALS)
    while fro == to:
        fro = random.choice(self.ANIMALS)
    # Check to make sure these two animals didn't have a conversation already (or at least the "to"
    # animal didn't already initiate a conversation with the "from" animal)
    already = False
    if to['name'] + to['animal'] in self.CONVOS.keys():
        if fro['name'] + fro['animal'] in self.CONVOS[to['name'] + to['animal']]:
            already = True
    # If this is a new conversation, continue
    if not already:
        self.TEMP += "\n" + to['name'] + ' asked ' + fro['name'] + ', "What exactly are you?"\n'
        self.TEMP += "\"Well, I'm " + referenced(fro['animal'])
        # If the "fro" animal has some properties in ConceptNet, print one randomly
        if "HasProperty" in self.ANIMAL_CONCEPTS[fro['animal']].keys():
            self.TEMP += " and I'm " + self.clean_phrase(
                singularize(random.choice(self.ANIMAL_CONCEPTS[fro['animal']]['HasProperty'])))
        self.TEMP += "."
        # If the "fro" animal has a "HasA" relationship in ConceptNet, print one randomly
        hasa = False
        if "HasA" in self.ANIMAL_CONCEPTS[fro['animal']].keys():
            has = referenced(
                singularize(random.choice(self.ANIMAL_CONCEPTS[fro['animal']]['HasA'])))
            self.TEMP += " I have " + self.clean_phrase(has)
            hasa = True
        # If the "fro" animal is capable of something, talk about it
        capable = False
        if "CapableOf" in self.ANIMAL_CONCEPTS[fro['animal']].keys():
            capable = True
            ability = random.choice(self.ANIMAL_CONCEPTS[fro['animal']]['CapableOf'])
            # Sometimes the "CapableOf" relationship in ConceptNet is negated, so make
            # sure we have consistent logic
            can = "can"
            if ability.find("cannot ") == 0:
                can = "cannot"
                ability = ability.replace("cannot ", "")
            if hasa:
                self.TEMP += " and"
            # State the ability and ask the "to" animal if they can do the same thing
            self.TEMP += " I " + can + " " + self.clean_phrase(ability) + ", can you?"
        if hasa and not capable:
            self.TEMP += "."
        self.TEMP += "\"\n"
        # If there was a stated ability for the "fro" animal
        if capable:
            # Check to see if the "to" animal also has the same ability, and if so say so
            canto = False
            if 'CapableOf' in self.ANIMAL_CONCEPTS[to['animal']].keys():
                if ability in self.ANIMAL_CONCEPTS[to['animal']]['CapableOf']:
                    canto = True
                    self.TEMP += '"Yes I can!"'
            # If not, say so
            if not canto:
                self.TEMP += '"No I can' + "'t"
                # If they have other abilities, though, pick one and print it
                if 'CapableOf' in self.ANIMAL_CONCEPTS[to['animal']].keys():
                    self.TEMP += ", but I do know how to " + self.clean_phrase(
                        random.choice(self.ANIMAL_CONCEPTS[to['animal']]['CapableOf'])) + "!"
                else:
                    self.TEMP += ","
                self.TEMP += '" replied ' + to['name'] + ".\n"
        # Add the conversation to the list
        # (key on the name+animal string, as above, rather than the dict itself)
        if to['name'] + to['animal'] not in self.CONVOS.keys():
            self.CONVOS[to['name'] + to['animal']] = []
        self.CONVOS[to['name'] + to['animal']].append(fro['name'] + fro['animal'])
    self.THEN = False
# The en module has a range of tools for word inflection:
# guessing the indefinite article of a word (a/an?),
# pluralization and singularization, comparative and superlative adjectives, verb conjugation.

# INDEFINITE ARTICLE
# ------------------

# The article() command returns the indefinite article (a/an) for a given noun.
# The definite article is always "the". The plural indefinite is "some".
print article("bear"), "bear"
print

# The referenced() command returns a string with article() prepended to the given word.
# The referenced() command is non-trivial, as demonstrated with the exception words below:
for word in ["hour", "one-liner", "European", "university", "owl", "yclept", "year"]:
    print referenced(word)
print
print

# PLURALIZATION
# -------------

# The pluralize() command returns the plural form of a singular noun (or adjective).
# The algorithm is robust and handles about 98% of exceptions correctly:
for word in ["part-of-speech", "child", "dog's", "wolf", "bear", "kitchen knife"]:
    print pluralize(word)
print pluralize("octopus", classical=True)
print pluralize("matrix", classical=True)
print pluralize("matrix", classical=False)
print pluralize("my", pos=ADJECTIVE)
print
f = """this’s pattern word tokenize""" print "tokens:", tokenize(f) sent_tokenize_test = """Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing such as parsing or text mining. Tokenization is useful both in linguistics (where it is a form of text segmentation), and in computer science, where it forms part of lexical analysis.""" print "sentence:",tokenize(sent_tokenize_test) from pattern.en import tag g = """In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition, as well as its context—i.e. relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.""" tagged_result = tag(g) print tagged_result from pattern.en import referenced referenced('book') from pattern.en import singularize singularize('wolves') from pattern.en import comparative comparative('bad') #‘worse’ from pattern.en import superlative from pattern.en import pluralize from pattern.en import sentiment
def enum_or(words):
    if len(words) == 1:
        return referenced(words[0])
    r = [referenced(w) for w in words]
    return '{0}, or {1}'.format(', '.join(r[:-1]), r[-1])
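# Usage sketch for enum_or() above (illustrative only):
print(enum_or(['owl']))                  # -> "an owl"
print(enum_or(['owl', 'hour', 'bear']))  # -> "an owl, an hour, or a bear"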
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This file is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA.

from pattern.en import referenced
print(referenced('university'))
print(referenced('hour'))

from pattern.en import pluralize, singularize
print(pluralize('child'))
print(singularize('wolves'))

from pattern.en import comparative, superlative
print(comparative('bad'))
print(superlative('bad'))

from pattern.en import conjugate, lemma, lexeme
print(lexeme('purr'))
def article_check(sentence): #sentence is a list of words s = " ".join(sentence) words = s.split() tags = tag(s) # print(tags) pos1 = [] #positions of NP,NNP pos2 = [] ans1 = [] ans2 = [] cnt = 0 b = False #determiner in stack f = False #1st NN/JJ/RR seen pronoun = False for i in range(len(tags)): word, pos = tags[i] if pos == "PRP$" or pos == "POS" or word == "i": pronoun = True elif word == "the" or word == "a" or word == "an": pos1.append([i, 0, 0]) # cnt+=1 # print(cnt) b = True elif (pos == "NN" or pos == "NNS" or pos == "NNP" or pos == "NNPS" or pos == "JJ" or pos == "JJR" or pos == "JJS" or pos == "RR" or pos == "RBR" or pos == "RBS"): if not pronoun: if b: if not f: pos1[cnt][1] = i pos1[cnt][2] = pos cnt += 1 f = True if pos == "NN" or pos == "NNS" or pos == "NNP" or pos == "NNPS": b = False f = False else: pos2.append([i, pos]) else: pronoun = False # print("pos1",pos1) # print("pos2",pos2) adj = False adject = False first = True # insertPos=False for indDT, indN, pos in pos1: word = tags[indN][0] dt = referenced(word).split()[0] DT = aORthe(dt, word) ans1.append([indDT, indN, DT]) # if pos=="JJ" or pos=="JJR" or pos =="JJS" or pos=="RR" or pos=="RBR" or pos =="RBS": # if first: # # insertPos=indN-1 # adj=pos # adject=word # first=False # if pos=="NN": # if adj=="JJS" or adj=="RBS": # # dt=referenced(tags[indN][0]) # ans1.append([indDT,indN,"the"]) # adj=False # first=True # adject=False # elif adj=="JJ" or adj=="JJR" or adj=="RBR" or adj=="RR": # dt=referenced(adject).split()[0] # DT=aORthe(dt,adject) # ans1.append([indDT,indN,DT]) # adj=False # first=True # adject=False # else: # dt=referenced(word).split()[0] # DT=aORthe(dt,word) # ans1.append([indDT,indN,DT]) # adj=False # first=True # adject=False adj = False adject = False first = True insertPos = False for indN, pos in pos2: word = tags[indN][0] if pos == "JJ" or pos == "JJR" or pos == "JJS": if first: insertPos = indN adj = pos adject = word first = False elif pos == "NN": if adj == "JJS" or adj == "RBS": # dt=referenced(tags[indN][0]) ans2.append([insertPos, indN, "the"]) adj = False first = True adject = False elif adj == "JJ" or adj == "JJR" or adj == "RBR" or adj == "RR": dt = referenced(adject).split()[0] DT = aORthe(dt, adject) ans2.append([insertPos, indN, DT]) adj = False first = True adject = False else: dt = referenced(word).split()[0] DT = aORthe(dt, word) ans2.append([insertPos, indN, DT]) adj = False first = True adject = False s1 = "" for ws in words: s1 = s1 + ws + " " # print("replace: (insertPos,-,dt)") # print(ans1) aans1 = [[] for word in sentence] for insertPos, nounPOS, dt in ans1: aans1[insertPos] = dt # print(aans1) # print("insert: (insertPos,-,dt)") aans2 = [[] for word in sentence] for insertPos, nounPOS, dt in ans2: # print(insertPos, nounPOS, dt) # print(insertPos==False) if not insertPos: aans2[nounPOS] = [dt1 + " " + words[nounPOS] for dt1 in dt] else: aans2[insertPos] = [dt1 + " " + words[insertPos] for dt1 in dt] # print(aans2) # print(aans1) f_ans = [] for i in range(len(aans2)): f_ans.append(aans2[i] + aans1[i]) return f_ans
from pattern.en import referenced
import random
import subprocess

with open('persons.txt') as f:
    persons = f.read().split('\n')
with open('actions.txt') as f:
    actions = f.read().split('\n')

random.shuffle(persons)
random.shuffle(actions)

text = '''% CONSIDER A NOVEL

# Chapter 1

Consider '''
text += ', or '.join(referenced(p) for p in persons)
text += ' who is '
text += ', '.join(actions)
text += '.'

with open('novel.md', 'w') as f:
    f.write(text)

# pass the command as an argument list so it runs without a shell
subprocess.call(['pandoc', '-o', 'novel.pdf', 'novel.md'])
def articleError(text, nlp, correctFlag=False):
    '''
    Purpose: To check if text has errors due to wrong article usage.
             Additionally, it returns corrected sentence.
    Parameters:
        text: string
            A string of text - single or a paragraph.
        correctFlag: boolean
            True or False
    Returns:
        count: integer
        text: Corrected sentence. (If correctFlag is True)
    '''
    path = "uncNouns.txt"
    unc_text = read_file(path)
    unc_words = []
    for i in unc_text:
        tokens = word_tokenize(i)
        unc_words.append(tokens[0].lower())
    doc = nlp(text)
    count = 0
    ntext = ""
    for s in doc.sentences:
        for t in range(len(s.words)):
            if s.words[t].text == 'a' or s.words[t].text == 'an':
                if ((s.words[t + 1].text in unc_words)
                        or s.words[t + 1].xpos == "NNS"
                        or s.words[t + 1].xpos == "NNPS"):
                    count += 1
                elif (t < len(s.words) - 2) and (s.words[t + 1].xpos in ["JJ", "JJR"]) \
                        and (s.words[t + 2].xpos in ['NNP', 'NN']):
                    if (s.words[t].text == 'a'
                            and referenced(s.words[t + 1].text) == ('an ' + s.words[t + 1].text)):
                        ntext += 'an'
                        count += 1
                    elif (s.words[t].text == 'an'
                          and referenced(s.words[t + 1].text) == ('a ' + s.words[t + 1].text)):
                        ntext += 'a'
                        count += 1
                    else:
                        ntext += s.words[t].text
                elif (s.words[t + 1].xpos not in ["NNP", "NN"]):
                    count += 1
                elif (s.words[t].text == 'a'
                      and referenced(s.words[t + 1].text) == ('an ' + s.words[t + 1].text)):
                    ntext += 'an'
                    count += 1
                elif (s.words[t].text == 'an'
                      and referenced(s.words[t + 1].text) == ('a ' + s.words[t + 1].text)):
                    ntext += 'a'
                    count += 1
                else:
                    ntext += s.words[t].text
                ntext += " "
            # elif (t < len(s.words) - 1) and (s.words[t].xpos in ["JJ", "JJR"]) \
            #         and (s.words[t + 1].xpos in ['NNP', 'NN']):
            #     ntext += referenced(s.words[t].text) + " "
            else:
                ntext += s.words[t].text
                ntext += " "
    if correctFlag == True:
        return count, ntext
    else:
        return count
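# Usage sketch for articleError() above (not from the source). The function
# indexes doc.sentences and word.xpos, which matches a stanza pipeline, and it
# also expects an "uncNouns.txt" file plus the read_file/word_tokenize helpers
# to be available in the calling module.
import stanza

nlp = stanza.Pipeline('en', processors='tokenize,pos')
count, corrected = articleError("He waited for a hour in an bus.", nlp, correctFlag=True)
print(count)      # number of suspected article errors
print(corrected)  # sentence with 'a'/'an' swapped where needed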
def extractFeaturesAndWriteBio(READ_PATH, file_type): global ALL_poems, bio, cnt for subdir, dirs, files in os.walk(READ_PATH): for file in files: num_of_files = len(files) - 1 # deduct the DS_store #print (num_of_files,'readDirectory',READ_PATH) if file_type in file and 'readme' not in file: # ID id = file.split(".")[0] print "\nID:", id.split("_")[1] filenames.append(id) cnt += 1 # print('') # print('') # print('OPENED:',id) # print('') # print('') poem_replaced = "" replacement_word = "" previous_replacement_word = "" author = "" titles = "" title = "" new_title = "" replaced_ls = [] new_titles_ls = [] quit_language = 0 oscillator = 0 # if EXCEPTION is raised... do not add to html SKIP_bool = False ########################## # Load POEM TEXT FILE # ########################## txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split( "_")[1] + ".txt" #print "txt_fn_path:",txt_fn_path if os.path.isfile(txt_fn_path) and cnt > 0: txt_data = open(txt_fn_path).read() # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html # txt_data.decode('ISO-8859-2') .decode('utf-8') # unicode(txt_data) author = txt_data.split("****!****")[0].strip(' \t\n\r') title = txt_data.split("****!****")[1].strip(' \t\n\r') bio = txt_data.split("****!****")[2] #.strip(' \t\n\r') ###### CLEAN BIO bio.replace("\t", "	") bio.replace("\n", " <br>") bio.replace("\r", " <br>") poem_replaced = bio #print poem_replaced ############################### # REPLACE AUTHOR NAME in poem ############################## author_ln = author.split(" ")[-1].lstrip() author_fn = author.split(" ")[:-1] author = " ".join(n for n in author_fn) + author_ln # #poem_replaced = poem_replaced.replace(author_ln,"Jhave") ####################### # replace BOOK TITLES ####################### #print "TITLES"] new_title = getNewTitle("title").encode('utf-8') ####################### # fake AUTHOR ####################### new_author = " ".join( random.choice(authors).split(" ") [1:-2]) + " " + random.choice(authors).split(" ")[-2] #print "new AUTHOR",new_author ############################ # replace years with another ############################ for w1 in poem_replaced.split("("): for w2 in w1.split(")"): if w2 is not None and w2.isdigit(): new_num = random.randint( int(w2) - 5, int(w2) + 5) #print "REPLACING #:",w2,new_num poem_replaced = poem_replaced.replace( w2, str(new_num)) replaced_ls.append(new_num) ################# # Load JSON # ################# response = loadJSONfile(READ_JSON_PATH + "poetryFoundation_" + id.split("_")[1] + "_Alchemy_JSON.txt") if response != "failed": if response.get('entities') is not None: for idx, entity in enumerate(response['entities']): #print idx ce = entity['text'].replace("0xc2", " ") ce = ce.replace("0xe2", "'") ce = re.sub( '(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce) ce = ce.encode('utf-8') try: content = ce.decode('utf-8').encode( 'ascii', 'xmlcharrefreplace') except UnicodeDecodeError: "AAAARGGGGHHH!!!!" 
if content in poem_replaced: ################################################ # Replace similar entities from other JSON # ################################################ replacement_entity = findSimilarEntityinRandomJSON( content, entity['type']) cr = re.sub( '(' + '|'.join( import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity) poem_replaced = poem_replaced.replace( content, replacement_entity) replaced_ls.append(replacement_entity) ########################## # POS REPLACMENT # ########################## token_tuples = nltk.word_tokenize(poem_replaced) tt = nltk.pos_tag(token_tuples) ################# # ADJECTIVES # ################# for i in tt: if "/i" not in i[0] and len( i[0]) > 2 and i[0] != "died": origw = re.sub( '(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0]) origw = import_utilities.strip_punctuation(origw) if i[1] == 'JJ': JJr = random.choice(JJ) # # JJr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr) # JJr = import_utilities.strip_punctuation(JJr) JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString( i[0], JJr.lstrip().lstrip()) if i[0].istitle(): JJr = JJr.title() poem_replaced = re.sub( r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced, 1) #poem_replaced.replace(i[0],JJr,1) replaced_ls.append(JJr) if i[1] == 'RB': RBr = random.choice(RB) RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString( i[0], RBr.lstrip().lstrip()) if i[0].istitle(): RBr = RBr.title() poem_replaced = re.sub( r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', RBr, poem_replaced, 1) replaced_ls.append(RBr) #print "RBr=",RBr,"repaced",i[0] ######################## # IS IT ENGLISH? # ######################## for line in poem_replaced.split('\n\r'): if len(line) > 0: if "english" not in import_utilities.get_language( line): quit_language += 1 #print "NOT english:",quit_language,line else: quit_language -= 1 ######################### # SYNSET REPLACE # ######################### for idx, word in enumerate(poem_replaced.split(' ')): if "<br>" not in word and "	" not in word and len( word) > 0: ######################### # PRONOUN ' VERB # ######################### if len(word.split("'")) > 1: if word.split("'")[0] in personal_pronouns: replacement_word = random.choice( personal_pronouns) + "'" + word.split( "'")[1] + ' ' poem_replaced.replace(word, replacement_word) #print "word,",word,"replacement_word:",replacement_word #################################################### # Replacement of OTHERs # #################################################### else: # elif not word.lower().strip(" \n\t\r") in stopwords.words('english'): # take off leading brackets, commas etc... word_punct_nopunct = import_utilities.strip_punctuation_bool( word) word_nopunct = word_punct_nopunct[ 'word'].strip(" .\n\t\r") word_punct = word_punct_nopunct['punct'] punct_bool = word_punct_nopunct['punct_bool'] #print "word_nopunct:",word_nopunct ####################################################### # MAIN EXCHANGE PROCESS CALL >>>>>>> GET THE SYNSET # ####################################################### similarterm = "" if word_nopunct[-4:].lower() == "here": similarterm = random.choice( import_utilities.heres) else: #print "WORD:",word_nopunct if len(word_nopunct) > 3: oscillator = oscillator + 1 ############################################ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # STYLE SWITCH..... should in future use POS # ... i.e. 
if noun & oscillator%3, do... # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ############################################ similarterm = import_utilities.find_synset_word( word_nopunct) # synset # if oscillator%4==0: # # SYNSET # similarterm = import_utilities.find_synset_word(word_nopunct) # #print "synset", similarterm # elif oscillator%3==0: # # RAP MOUTH # similarterm = random.choice(rap_mouth) # #print "rap",similarterm # # elif oscillator%2==0: # else: # similarterm = import_utilities.find_synset_word(word_nopunct) # # SCIENCE MOUTH # similarterm = random.choice(science_mouth) # if similarterm.endswith('logy'): # similarterm = similarterm[:-4] # if similarterm.endswith('o'): # similarterm = similarterm[:-1] #print "science_mouth",similarterm # if len(similarterm)<6: # similarterm = random.choice(import_utilities.curses) # else: # # FILTH # print "filth" # similarterm = random.choice(import_utilities.curses) ############################################ # manually get rid of some terrible choices ############################################ if similarterm == "ilk": ##print "like" similarterm = "like" if similarterm == "ope": ##print "doth" similarterm = "does" if similarterm == "information technology": ##print "doth" similarterm = "it" if similarterm == "velleity": ##print "doth" similarterm = "want" if similarterm == "Crataegus laevigata": ##print "doth" similarterm = "may" if similarterm == "brunet" or similarterm == "ot": ##print "doth" similarterm = random.choice( import_utilities.curses) if similarterm == "ge": ##print "doth" similarterm = random.choice(science_mouth) if similarterm.lower() == "nox": ##print "doth" similarterm = random.choice(science_mouth) if similarterm.lower() == "paunited": print "################### paUnited ###################" similarterm = word ####################################### # abbreviations for f*****g states! 
# ####################################### if word_nopunct.upper( ) in import_utilities.state_abbrev and word_nopunct.lower( ) not in stopwords.words( 'english') and "me," not in word: tmp = similarterm if word_nopunct == "oh": similarterm = random.choice( import_utilities.exclaims) else: similarterm = random.choice( rap_mouth) # RESERVOIR)RESERVOIR) #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line ############## # hyphenated # ############## hyp = word.split("-") #print word,len(hyp) if len(hyp) > 1: similarterm = "" for w in hyp: if len(w) > 2: similarterm += import_utilities.find_synset_word( w) + "-" similarterm = import_utilities.strip_underscore( similarterm[:-1]) #print "hyphenated:",word,"replaced by: "+similarterm ######################################################### # is it a TRUNCATED VERB slang as in singin or wishin # ######################################################### if similarterm == word_nopunct and len( word ) > 2 and 'in' in word_nopunct[-2:]: similarterm = import_utilities.find_synset_word( word_nopunct + 'g') #print "TRUNCATED SLANG word: '"+word+"'",similarterm interim = import_utilities.lemma( similarterm) ## #print interim similarterm = import_utilities.conjugate( interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1] ################# # SWEAR WORD # ################# ##print "at the garden of if:", word if word_nopunct in import_utilities.curses: similarterm = random.choice( import_utilities.curses) #print "SWEAR WORD word: '"+word+"'",similarterm if len(hyp) > 1: replacement_word = similarterm else: replacement_word = word.replace( word_nopunct, similarterm) replacement_word = import_utilities.strip_underscore( replacement_word) replacement_word = import_utilities.replaceNumbers( replacement_word) #print "replacement_word:",replacement_word ######################### # RESERVOIR_OF_WEIRDNESS # ######################### if word_nopunct.lower( ) in import_utilities.impera: replacement_word = random.choice( import_utilities.impera) #print word,"IMPERA:",replacement_word elif word_nopunct.lower( ) in import_utilities.conjuncts: replacement_word = random.choice( import_utilities.conjuncts) #print word," CONJUNCTION replaced with",replacement_word elif word_nopunct.lower( ) in import_utilities.indef_prono: replacement_word = random.choice( import_utilities.indef_prono) #print word," INDEF_prono replaced with",replacement_word elif word_nopunct.lower( ) in import_utilities.prepo: replacement_word = random.choice( import_utilities.prepo) #print word," prepo replaced with",replacement_word elif word_nopunct.lower( ) in import_utilities.rel_prono: replacement_word = word #print word," rel_prono LEAVE alone: ",replacement_word elif word_nopunct.lower( )[-2:] == "ly" or word_nopunct.lower( )[-3:] == "ly.": replacement_word = import_utilities.strip_underscore( import_utilities.find_synset_word( word)) #(word[:-2]) #print word," ADVERB: ",replacement_word # if replacement_word[-2:] !="ly": # replacement_word +="ly" else: if len( hyp ) < 2 and "like" not in word_nopunct and import_utilities.singularize( word_nopunct ) == import_utilities.singularize( replacement_word ) and word_nopunct.lower( ) not in import_utilities.stopwords_ls: if word not in RESERVOIR and import_utilities.countPunctuation( word ) < 1 and len( word_nopunct ) > 3 and not word_nopunct.istitle(): if len( word ) > 4 and 
english_dict.check(word): #print "ADDING",word,"to reservoir" RESERVOIR.append(word) #RESERVOIR = list(set()) replacement_word = random.choice( RESERVOIR) #print word_nopunct,"replaced from reservoir with", replacement_word # print "'"+word_nopunct+"' vs RESERVOIR replacement_word:",replacement_word #," new_line:",new_line if quit_language > 1 and not word_nopunct.istitle( ): #print quit_language, "Probably foreign language: make a word salad in english" replacement_word = random.choice( rap_mouth) #RESERVOIR) #print word_nopunct,"OTHER replaced from reservoir with", replacement_word ################################################### # MOST REPLACEMENT occurs here... # ################################################### poem_ls = poem_replaced.split(' ') idx = poem_ls.index(word) #print idx,",", poem_ls[idx],",", word ,",",replacement_word #print word ," --- ",previous_replacement_word,replacement_word try: #print "poem_ls[idx]",poem_ls[idx],"word",word if poem_ls[ idx] == word and "****" not in word and "." != word and "\n" not in word: # if "\n" in word: # replacement_word=replacement_word+"\n" # if replacement_word=="": # replacement_word=random.choice(RESERVOIR) poem_ls[ idx] = replacement_word #.encode('utf-8') "REPLACE", word, "with", replacement_word poem_replaced = " ".join(poem_ls) # store this word so that conjugation can be checked previous_replacement_word = replacement_word except Exception, e: print "PENULTIMATE SKIP_bool replace FAIL", e SKIP_bool = True continue ########################################################################### # testing Pattern.en as parser for conjugation and article replacement # # much more robust than my hand-coded hacks # ########################################################################### # correct CONJUGATion of paticiple verbs with pattern.en parsed = parse(poem_replaced, tags=True) pre_verbal = ["'m", "'s", "'re"] for idx, p in enumerate(parsed.split(" ")): tok = p.split("/")[0] typ = p.split("/")[1] #print idx,tok,typ if tok in pre_verbal: #print "pre_verbal:",tok next_word = parsed.split(" ")[idx + 1].split("/") # try try try for ix, n in enumerate(next_word): next_word[ix] = re.sub( '(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, n).encode('utf-8') try: #print next_word,next_word[0],next_word[1][:2] # if it's a verb that follows if next_word[1][:2] == "VB": before_verb = " ".join( w for w in poem_replaced.split(" ") [:idx]) #.encode('utf-8') after_verb = " ".join( w for w in poem_replaced.split(" ") [idx + 1:]) #.encode('utf-8') new_verb = conjugate( next_word[0], tense=PARTICIPLE, parse=True).encode('utf-8') # insert new #print "CONJUGATION needed, changing:",poem_replaced.split(" ")[idx],"to",parsed.split(" ")[idx],poem_replaced.split(" ")[idx-1]+" "+new_verb poem_replaced = before_verb + " " + new_verb + " " + after_verb except Exception, e: #print "INside parsed COnjugation loop",e continue # correct ARTICLES for idx, word in enumerate(poem_replaced.split(" ")): if len(word) > 0 and idx != 0 and " " not in word: # A or AN if poem_replaced.split(" ")[idx - 1].lower( ) == "a" or poem_replaced.split(" ")[ idx - 1].lower() == "an": #print word,"---",article(word)+" "+word before_article = " ".join( w for w in poem_replaced.split(" ")[:idx - 1]) after_article = " ".join( w for w in poem_replaced.split(" ")[idx + 1:]) new_conj = referenced(word) # capitalize if poem_replaced.split(" ")[idx - 1].istitle(): new_conj = new_conj.split(" ")[0].title( ) + " " + new_conj.split(" ")[1] poem_replaced = 
before_article + " " + new_conj + " " + after_article ######################### # WRITE SINGLE POEM # ######################### if not SKIP_bool: tmp_poem = "" # poem_replaced.replace("\t","	") # poem_replaced.replace("\n"," <br>") # poem_replaced.replace("\r"," <br>") HTML_poem = "" for line in poem_replaced.split("\n"): #print "LINE", line HTML_poem += line + "<br>" if len(response) > 0 and len(id.split("_")) > 1: # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem try: ALL_poems = "<br>[ A generated-poem based upon: <i>" + title + "</i> by <b>" + author + "</b>]<br><br><i>" + new_title + "</i><br> by <b>" + new_author + "</b><br>" + HTML_poem + ALL_poems.split( "</h2>")[1].replace(" ", " ") tmp_poem = "[A generated-poem based upon: '" + title + "' by " + author + "]\n\n" + new_title + "\nby " + new_author + "\n" + poem_replaced print "\n~~~\n\n" + tmp_poem #print "\nORIGINAL:",bio txt_fn = id.split("_")[1] + "_POEMs.txt" WRITE_BIO_PATH = DATA_DIR + "generated/POEMS/POEMS_" + datetime.datetime.now( ).strftime('%Y-%m-%d_%H') + "/" if not os.path.exists(WRITE_BIO_PATH): os.makedirs(WRITE_BIO_PATH) txt_fn_path = WRITE_BIO_PATH + txt_fn f_txt = open(txt_fn_path, 'w') f_txt.write(tmp_poem) #.encode('utf-8')) f_txt.close() #print "\nTXT file created at:",txt_fn_path ####### # write them all.... wasteful... but useful if run is interrupted.... ########### # if cnt==1: # ALL_poems = ALL_poems_intro+ALL_poems # else: ALL_poems = ALL_poems_intro + ALL_poems.replace( " ", " ") ALL_poems = ALL_poems.replace( "$$datetime$$", datetime.datetime.now().strftime( '%Y-%m-%d at %H:%M')) ALL_poems = ALL_poems.replace( "$$cnt$$", str(cnt)) #print "cnt",cnt ALL_poems = ALL_poems.replace( "$$gentime$$", str(time.time() - start_time)) # ALL POEMS txt_fn = datetime.datetime.now().strftime( '%Y-%m-%d_%H' ) + "_poetryFoundation_generatedPOEMS_" + type_of_run + ".html" txt_fn_path = DATA_DIR + "generated/POEMS/" + txt_fn f_txt = open(txt_fn_path, 'w') f_txt.write(ALL_poems + "</hmtl>") f_txt.close() #print "\nTXT file created at:",txt_fn_path except Exception, e: print "At the final LOOP", e continue else: print "~! EMPTY response:", author else: cnt = cnt - 1
# Guessing the indefinite article of a word (a/an?),
# pluralization and singularization, comparative and superlative adjectives, verb conjugation.

from pattern.en import article, referenced, pluralize, ADJECTIVE

# INDEFINITE ARTICLE
# ------------------
# The article() function returns the indefinite article (a/an) for a given noun.
# The definite article is always "the". The plural indefinite is "some".
print article("bear"), "bear"
print

# The referenced() function returns a string with article() prepended to the given word.
# The referenced() function is non-trivial, as demonstrated with the exception words below:
for word in [
        "hour", "one-liner", "European", "university", "owl", "yclept", "year"
]:
    print referenced(word)
print
print

# PLURALIZATION
# -------------
# The pluralize() function returns the plural form of a singular noun (or adjective).
# The algorithm is robust and handles about 98% of exceptions correctly:
for word in [
        "part-of-speech", "child", "dog's", "wolf", "bear", "kitchen knife"
]:
    print pluralize(word)
print pluralize("octopus", classical=True)
print pluralize("matrix", classical=True)
print pluralize("matrix", classical=False)
print pluralize("my", pos=ADJECTIVE)
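
# A minimal sketch of the a/an correction pass used by the generators above,
# assuming pattern.en's referenced(); fix_articles() is a hypothetical helper
# name introduced only for illustration, not part of the original script.
from pattern.en import referenced

def fix_articles(text):
    words = text.split(" ")
    out = []
    skip = False
    for i, w in enumerate(words):
        if skip:                      # the noun was already emitted with its article
            skip = False
            continue
        if w.lower() in ("a", "an") and i + 1 < len(words):
            repl = referenced(words[i + 1])   # e.g. "an hour", "a university"
            if w.istitle():                   # preserve capitalisation of the article
                repl = repl[0].upper() + repl[1:]
            out.append(repl)
            skip = True
        else:
            out.append(w)
    return " ".join(out)

print(fix_articles("she waited an minute and a hour"))
# -> "she waited a minute and an hour"
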
def extractFeaturesAndWritePoem(READ_PATH,file_type): global ALL_poems,bio,cnt,SMALL_POEM,SMALL_POEM_ALL inp=0 sub_cnt=0 words_total=0 lines_total=0 pause_every = 0 for subdir, dirs, files in os.walk(READ_PATH): #print "randomizing",datetime.datetime.now() random.seed(datetime.datetime.now()) random.shuffle(files) for file in files: num_of_files = len(files)-1 # deduct the DS_store #print (num_of_files,'readDirectory',READ_PATH) if file_type in file and 'readme' not in file: JSON_alchemy_loaded = False # ID id=file.split(".")[0] #print "\nID:",id.split("_")[1] filenames.append(id) cnt+=1 # print('') # print('') # print('OPENED:',id) # print('') # print('') ############## # HOW MANY? # ############## sub_cnt+=1 if sub_cnt>=int(inp): if int(inp) != 0: end_time = time.time() es = end_time-start_time print "\n",sub_cnt, "poems,\n",lines_total,"lines,\n",words_total,"words \ngenerated in\n",("%.2f" % es),"seconds" words_total=0 lines_total=0 # RESTART sub_cnt=0 inp = raw_input("\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ") if not inp: print "You entered nothing! 10 poems will be generated." inp=10 pause_every = raw_input("\nPause every 1 or 2 or ... poems?") if not pause_every: print "You entered nothing! Pause will occur every 10 poems." pause_every=10 sleep_time = raw_input("\nPause for how many seconds?") if not sleep_time: print "You entered no time! 10 second wait assigned." sleep_time=10 print "\n\n^^^^^^^^^^^^^^^" start_time = time.time() print 'Poem #',sub_cnt poem_replaced = "" replacement_word = "" previous_replacement_word = "" author="" titles="" title="" new_title="" replaced_ls =[] new_titles_ls = [] quit_language=0 oscillator=0 word_cnt=0 # if EXCEPTION is raised... do not add to html SKIP_bool=False ########################## # Load POEM TEXT FILE # ########################## ## # PAUSE ## #time.sleep(5) txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt" #print "txt_fn_path:",txt_fn_path if os.path.isfile(txt_fn_path) and cnt>0: txt_data=open(txt_fn_path).read() # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html # txt_data.decode('ISO-8859-2') .decode('utf-8') # unicode(txt_data) author=txt_data.split("****!****")[0].strip(' \t\n\r') title=txt_data.split("****!****")[1].strip(' \t\n\r') bio=txt_data.split("****!****")[2]#.strip(' \t\n\r') ###### CLEAN BIO bio.replace("\t","	") bio.replace("\n"," <br>") bio.replace("\r"," <br>") bio.replace("","~~~~!~~~") poem_replaced=bio #print poem_replaced ############################### # REPLACE AUTHOR NAME in poem # ############################### author_ln=author.split(" ")[-1].lstrip() author_fn=author.split(" ")[:-1] author = " ".join(n for n in author_fn)+author_ln # #poem_replaced = poem_replaced.replace(author_ln,"Jhave") ####################### # replace BOOK TITLES # ####################### #print "TITLES"] new_title = getNewTitle("title").encode('utf-8') ####################### # fake AUTHOR # ####################### new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2] #print "new AUTHOR",new_author ############################ # replace years with another ############################ for w1 in poem_replaced.split("("): for w2 in w1.split(")"): if w2 is not None and w2.isdigit(): new_num = random.randint(int(w2)-5,int(w2)+5) #print "REPLACING #:",w2,new_num poem_replaced = poem_replaced.replace(w2,str(new_num)) replaced_ls.append(new_num) ################# # Load JSON # ################# response = 
loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt") if response != "failed": JSON_alchemy_loaded = True if response.get('entities') is not None: for idx,entity in enumerate(response['entities']): #DATA clean the original words (redundant duplicate but for some reason it works... and is necessary... a kludge of crowbars and bleach) ce = entity['text'].replace("0xc2"," ") ce = ce.replace("0xe2","'") ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce) ce = ce.encode('utf-8') try: content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace') except UnicodeDecodeError: "AAAARGGGGHHH!!!!" if content in poem_replaced: ################################################# # # # Replace similar entities from other JSON # # Using data from ALCHEMY API # # # ################################################# replacement_entity = findSimilarEntityinRandomJSON(content,entity['type']) cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity) poem_replaced = poem_replaced.replace(content,replacement_entity) replaced_ls.append(replacement_entity) ########################## # POS REPLACMENT # ########################## token_tuples = nltk.word_tokenize(poem_replaced) tt = nltk.pos_tag(token_tuples) ################# # ADJECTIVES # ################# for i in tt: if "/i" not in i[0] and len(i[0])>3 and i[0] != "died": origw = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0]) origw =import_utilities.strip_punctuation(origw) if i[1]=='JJ' : JJr = random.choice(JJ) # # JJr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr) # JJr = import_utilities.strip_punctuation(JJr) JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.lstrip().lstrip()) if i[0].istitle(): JJr = JJr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1) replaced_ls.append(JJr) if i[1]=='RB': RBr = random.choice(RB) RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.lstrip().lstrip()) if i[0].istitle(): RBr = RBr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', RBr, poem_replaced,1) replaced_ls.append(RBr) ######################## # IS IT ENGLISH? # ######################## for line in poem_replaced.split('\n\r'): if len(line)>0 : if "english" not in import_utilities.get_language(line): quit_language+=1 #print "NOT english:",quit_language,line else: quit_language-=1 ######################### # SYNSET REPLACE # ######################### for idx,word in enumerate(poem_replaced.split(' ')): similarterm="" if "<br>" not in word and "	" not in word and len(word)>0: words_total+=1 ######################### # PRONOUN ' VERB # ######################### if len(word.split("'"))>1: if word.split("'")[0] in personal_pronouns: replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' ' poem_replaced.replace(word,replacement_word) #print "word,",word,"replacement_word:",replacement_word #################################################### # Replacement of OTHERs # #################################################### elif not word.lower().strip(" \n\t\r") in stopwords.words('english'): # take off leading brackets, commas etc... 
word_punct_nopunct = import_utilities.strip_punctuation_bool(word) word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r") word_punct = word_punct_nopunct['punct'] punct_bool = word_punct_nopunct['punct_bool'] ####################################################### # MAIN EXCHANGE PROCESS CALL >>>>>>> GET THE SYNSET # ####################################################### if word_nopunct[-4:].lower()=="here": similarterm=random.choice(import_utilities.heres) else: #print "WORD:",word_nopunct if len(word_nopunct)>3: oscillator = oscillator+1 ############################################ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # STYLE SWITCH..... should in future use POS # ... i.e. if noun & oscillator%3, do... # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ############################################ # synset similarterm = import_utilities.synset_creeley(word_nopunct) #print "synset", similarterm if similarterm is not None and similarterm == word_nopunct and len(word_nopunct)>4: #RESERVOIR.sort(key=len) poetry_mouth.sort(key=len) similarterm= poetry_mouth[idx%len(poetry_mouth)]#RESERVOIR[idx%len(RESERVOIR)] #print "NEW",idx,len(RESERVOIR),similarterm,word_nopunct,"PRE>>>>>>>>LAST CHANGE STOP: ", word, "~",similarterm ####################################### # abbreviations for f*****g states! # ####################################### if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word: tmp = similarterm if word_nopunct == "oh": similarterm = random.choice(import_utilities.exclaims) else: similarterm = random.choice(poetry_mouth)#RESERVOIR) #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line ############## # hyphenated # ############## hyp =word.split("-") #print word,len(hyp) if len(hyp) >1: similarterm="" for w in hyp: if len(w) > 2: if import_utilities.synset_creeley(w) is not None: similarterm += import_utilities.synset_creeley(w)+"-" else: similarterm += w+"-" similarterm = import_utilities.strip_underscore(similarterm[:-1]) #print "hyphenated:",word,"replaced by: "+similarterm ######################################################### # is it a TRUNCATED VERB slang as in singin or wishin # ######################################################### # if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]: # similarterm = import_utilities.synset_creeley(word_nopunct+'g') # ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm # interim = import_utilities.lemma(similarterm) # ## #print interim # similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] # # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1] ################# # SWEAR WORD # ################# ##print "at the garden of if:", word if word_nopunct in import_utilities.curses: similarterm = random.choice(import_utilities.curses) ##print "SWEAR WORD word: '"+word+"'",similarterm ############################################ # manually get rid of some terrible choices ############################################ naw_terms=["mind","lonely"] if similarterm == "ilk": similarterm = "like" if similarterm == "Nox": similarterm = "oil" if similarterm == "ope": similarterm = "does" if similarterm == "information technology": similarterm = "it" if similarterm == "velleity": similarterm = "want" if similarterm == "Crataegus laevigata": similarterm = "may" if similarterm == "eff": 
similarterm = "know" if similarterm == "naw": similarterm = "mind" if similarterm == "lento": similarterm = "slow" #print "SIMILAR:",similarterm if similarterm is not None: if len(hyp) >1: replacement_word = similarterm else: replacement_word = word.replace(word_nopunct, similarterm) replacement_word = import_utilities.strip_underscore(replacement_word) replacement_word = import_utilities.replaceNumbers(replacement_word) else: replacement_word = random.choice(poetry_mouth)#RESERVOIR) ################################ # RESERVOIR_OF_WEIRDNESS # # create a large pool of words # ################################ if word_nopunct.lower() in import_utilities.impera: replacement_word=random.choice(import_utilities.impera) #print word,"IMPERA:",replacement_word elif word_nopunct.lower() in import_utilities.conjuncts: replacement_word=random.choice(import_utilities.conjuncts) #print word," CONJUNCTION replaced with",replacement_word elif word_nopunct.lower() in import_utilities.indef_prono: replacement_word=random.choice(import_utilities.indef_prono) #print word," INDEF_prono replaced with",replacement_word elif word_nopunct.lower() in import_utilities.prepo: replacement_word=random.choice(import_utilities.prepo) #print word," prepo replaced with",replacement_word elif word_nopunct.lower() in import_utilities.rel_prono: replacement_word=word #print word," rel_prono LEAVE alone: ",replacement_word elif word_nopunct.lower()[-2:] =="ly": if import_utilities.synset_creeley(word) is not None: replacement_word=import_utilities.strip_underscore(import_utilities.synset_creeley(word))#(word[:-2]) #print word," ADVERB: ",replacement_word # if replacement_word[-2:] !="ly": # replacement_word +="ly" else: if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) == import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls: if word not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): #print "ADDING",word,"to reservoir" ################################################# # ADDING ONLY SMALL WORDS # & MAKING A POEM OUT OF THEM ################################################# if len(word)<7 and len(word)>0: small_word = word if random.randint(0,4)==3: small_word +="\n" #print small_word small_word +=" " SMALL_POEM+=small_word RESERVOIR.append(word) #SMALL_POEM_ALL.append(small_word) replacement_word = random.choice(poetry_mouth)#RESERVOIR)#rap_mouth)# RESERVOIR) #print word_nopunct,"replaced from reservoir with", replacement_word # print "'"+word_nopunct+"' vs RESERVOIR replacement_word:",replacement_word #," new_line:",new_line if quit_language>1 and not word_nopunct.istitle(): #print quit_language, "Probably foreign language: make a word salad in english" replacement_word = random.choice(poetry_mouth)#RESERVOIR)#science_mouth)#RESERVOIR) #print word_nopunct,"OTHER replaced from reservoir with", replacement_word ################################################### # MOST REPLACEMENT occurs here... # ################################################### poem_ls = poem_replaced.split(' ') idx = poem_ls.index(word) # print idx,",", poem_ls[idx],",", word ,",",replacement_word #print word ," --- ",previous_replacement_word,replacement_word idx_2 = poem_ls.index(word) # BUG test: is potential replacement a comma or period or empty? 
if replacement_word.lstrip().rstrip() =="," or replacement_word.lstrip().rstrip() =="" or replacement_word.lstrip().rstrip() ==".": #print "found a comma/empty why?",replacement_word.lstrip().rstrip() replacement_word=random.choice(poetry_mouth) #print "line633 REPLACING with ",replacement_word if poem_ls[idx]==word and poem_ls[idx]==replacement_word: #print "SAME idx-2 replacement_word=",replacement_word # search for same grammatical type the NLTK lists replacement_word= findSamePOS(replacement_word) #print "after findSamePOS replacement_word=",replacement_word #print idx,idx_2," poem_ls[idx_2]=", poem_ls[idx_2]," poem_ls[idx]=", poem_ls[idx]," word=", word ," replacement=",replacement_word if replacement_word == "~~~~!~~~" or poem_ls[idx]== "~~~~!~~~": print "~~~~!~~~ FOUND ******" else: if poem_ls[idx]==word: poem_ls[idx]=replacement_word if poem_ls[idx_2]==word: poem_ls[idx_2]=replacement_word poem_replaced = " ".join(poem_ls) # still the same? try another game if len(word)>5 and replacement_word.lstrip().rstrip() == word_nopunct.lstrip().rstrip(): ################################################# # since word is same as replacement, try alchemy? ################################################# #replacement_entity = findSimilarEntityinRandomJSON(content,entity['type']) # a last ditch pseudo random select # TODO USE THE NLTK LISTS TO SELECT POS WORD # RESERVOIR.sort(key=len) # replacement_word = RESERVOIR[idx%len(RESERVOIR)] #poetry_mouth.sort(key=len) #INSERTION usi #replacement_word = random.choice(poetry_mouth)#[idx%len(poetry_mouth)] replacement_word= findSamePOS(replacement_word) #print "NEWEST POS",idx,len(poetry_mouth),"LAST CHANGE STOP: ", word, "~",replacement_word # check again if poem_ls[idx]==word and poem_ls[idx]==replacement_word: #print "AGAIN SAME idx replacement_word=",replacement_word replacement_word=random.choice(poetry_mouth) #print "line663 AGAIN NEW rand pf=",replacement_word # REPLACE (but catch for weird chars) try: if poem_ls[idx]==word and "****" not in word and "." 
!= word and "\n" not in word: # INSERTION poem_ls[idx]=replacement_word #print "line673 REPLACING",poem_ls[idx]," with ",replacement_word # REASSEMBLE the poem poem_replaced = " ".join(poem_ls) # store this word so that conjugation can be checked previous_replacement_word=replacement_word except Exception, e: #print "PENULTIMATE SKIP_bool replace FAIL",e SKIP_bool=True continue ########################################################################### # testing Pattern.en as parser for conjugation and article replacement # # much more robust than my hand-coded hacks # ########################################################################### # correct CONJUGATion of paticiple verbs with pattern.en parsed = parse(poem_replaced,tags = True) pre_verbal = ["'m","'s","'re"] for idx,p in enumerate(parsed.split(" ")): tok =p.split("/")[0] typ=p.split("/")[1] #print idx,tok,typ if tok in pre_verbal: #print "pre_verbal:",tok next_word= parsed.split(" ")[idx+1].split("/") # try try try for ix,n in enumerate(next_word): next_word[ix] = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, n).encode('utf-8') try: #print next_word,next_word[0],next_word[1][:2] # if it's a verb that follows if next_word[1][:2] =="VB": before_verb = " ".join(w for w in poem_replaced.split(" ")[:idx])#.encode('utf-8') after_verb = " ".join(w for w in poem_replaced.split(" ")[idx+1:])#.encode('utf-8') new_verb = conjugate(next_word[0], tense=PARTICIPLE, parse=True).encode('utf-8') # insert new #print "CONJUGATION needed, changing:",poem_replaced.split(" ")[idx],"to",parsed.split(" ")[idx],poem_replaced.split(" ")[idx-1]+" "+new_verb poem_replaced = before_verb+" "+new_verb+" "+after_verb except Exception, e: # print "INside parsed COnjugation loop",e continue # correct ARTICLES for idx,word in enumerate(poem_replaced.split(" ")): if len(word)>0 and idx != 0 and " " not in word: # A or AN if poem_replaced.split(" ")[idx-1].lower() =="a" or poem_replaced.split(" ")[idx-1].lower() =="an": #print word,"---",article(word)+" "+word before_article = " ".join(w for w in poem_replaced.split(" ")[:idx-1]) after_article = " ".join(w for w in poem_replaced.split(" ")[idx+1:]) new_conj = referenced(word) # capitalize if poem_replaced.split(" ")[idx-1].istitle(): new_conj = new_conj.split(" ")[0].title()+" "+new_conj.split(" ")[1] poem_replaced = before_article+" "+new_conj+" "+after_article ######################### # bug check ,, # ######################### poem_replaced = poem_replaced.replace(",,", ",") poem_replaced = poem_replaced.replace(",.", ",") poem_replaced = poem_replaced.replace(".,", ".") ######################### # WRITE SINGLE POEM # ######################### if not SKIP_bool: tmp_poem="" # poem_replaced.replace("\t","	") # poem_replaced.replace("\n"," <br>") # poem_replaced.replace("\r"," <br>") HTML_poem="" for line in poem_replaced.split("\n"): #print "LINE", line lines_total+=1 HTML_poem += line+"<br>" if len(response) >0 and len(id.split("_"))>1: ALL_poems = "<br>[ A generated-poem based upon: <i>"+ title +"</i> by <b>"+ author+"</b>]<br><br><i>"+new_title+"</i><br> by <b>"+ new_author +"</b><br>"+HTML_poem+ALL_poems.split("</h2>")[1].replace(" "," ") tmp_poem= "\n[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+new_title+ "\nby "+new_author+"\n"+poem_replaced ##################### # # # # # PAUSE IT # # # # # ##################### if (int(sub_cnt)%int(pause_every) == 0 and int(sub_cnt) !=0): time.sleep(int(sleep_time)) ##################### # # # # # PRINT # 
# # # # ##################### print "\n~~~\n" +tmp_poem+"\n~~~\n" # SLOW TYPEWRITER PRESENTATION # for line in tmp_poem: # for c in line: # time.sleep(0.04) # sys.stdout.write(c)#(c.encode("utf8")) # sys.stdout.flush() # #sys.stdout.write("\n") txt_fn = id.split("_")[1]+"_POEMs.txt" WRITE__PATH = "../../generated/poetryFoundation/"+poem_style+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/" if not os.path.exists(WRITE__PATH): os.makedirs(WRITE__PATH) txt_fn_path = WRITE__PATH+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write(tmp_poem)#.encode('utf-8')) f_txt.close(); #print "\nTXT file created at:",txt_fn_path WRITE__PATH = "../../generated/poetryFoundation/"+poem_style+"_SMALL_POEMS"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/" if not os.path.exists(WRITE__PATH): os.makedirs(WRITE__PATH) txt_fn_path = WRITE__PATH+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write("[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+SMALL_POEM)#.encode('utf-8')) f_txt.close(); SMALL_POEM="" ####### # write them all.... wasteful... but useful if run is interrupted.... ########### # if cnt==1: # ALL_poems = ALL_poems_intro+ALL_poems # else: ALL_poems = ALL_poems_intro+ALL_poems.replace(" "," ") ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M')) ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt)) ALL_poems = ALL_poems.replace("$$style$$",poem_style) ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time)) # ALL POEMS txt_fn = datetime.datetime.now().strftime('%Y-%m-%d')+"_BDP_generated_"+poem_style+"_POEMS_"+str(poem_id)+".html" GEN_PATH = GENERATED_DIR+type_of_run+"_html/" if not os.path.exists(GEN_PATH): os.makedirs(GEN_PATH) txt_fn_path = GEN_PATH+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write(ALL_poems+"</hmtl>") f_txt.close(); #print "\nTXT file created at:",txt_fn_path # except Exception, e: # print "At the final LOOP",e # #continue # pass else: pass #print "~! EMPTY response:", author else: cnt = cnt-1
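
# A minimal sketch of the participle correction applied after "'m / 's / 're"
# in both generator functions, mirroring the conjugate(..., tense=PARTICIPLE)
# call used above; the surrounding token loop is simplified away and pattern.en
# is assumed to be installed.
from pattern.en import conjugate, lemma, PARTICIPLE

for verb in ("purred", "runs", "gone"):
    base = lemma(verb)                                    # purr, run, go
    print(base + " -> " + conjugate(base, tense=PARTICIPLE))
# expected: purr -> purring, run -> running, go -> going
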