Python moveBeginAndEndPunctuationFromStrToString Exemples, import_utilities.moveBeginAndEndPunctuationFromStrToString Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : ELO2015_Alchemy_poetryFoundation_HATCHER-input-pause.py Projet : Aranjedeath/Big-Data-Poetry

def extractFeaturesAndWriteBio(READ_PATH, file_type):

    global ALL_poems, bio, cnt, start_time

    inp = 0
    sub_cnt = 0
    words_total = 0
    lines_total = 0

    for subdir, dirs, files in os.walk(READ_PATH):
        for file in files:

            num_of_files = len(files) - 1  # deduct the DS_store
            #print (num_of_files,'readDirectory',READ_PATH)

            if file_type in file and 'readme' not in file:

                # ID
                id = file.split(".")[0]
                #print "\n\n*********\nID:",id

                filenames.append(id)
                cnt += 1

                # print('')
                #print('')
                # print('OPENED:',id)
                # print('')
                #print('')

                ##############
                #  HOW MANY? #
                ##############
                sub_cnt += 1
                if sub_cnt >= inp:
                    if inp != 0:
                        end_time = time.time()
                        es = end_time - start_time
                        print sub_cnt, "poems,\n", lines_total, "lines, &\n", words_total, "words \ngenerated in\n", (
                            "%.2f" % es), "seconds"

                    words_total = 0
                    lines_total = 0

                    # RESTART

                    sub_cnt = 0
                    inp = input(
                        "\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ")
                    print "\n\n^^^^^^^^^^^^^^^"
                    start_time = time.time()

                print 'Poem #', sub_cnt + 1

                poem_replaced = ""
                replacement_word = ""
                author = ""
                titles = ""
                title = ""
                new_title = ""

                replaced_ls = []
                new_titles_ls = []
                quit_language = 0

                #################################################################
                # Load  POEM TEXT FILE (based on id extracted from Alchemy JSON)    #
                #################################################################

                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split(
                    "_")[1] + ".txt"
                #print "txt_fn_path:",txt_fn_path

                if os.path.isfile(txt_fn_path) and cnt > 0:
                    txt_data = open(txt_fn_path).read()

                    # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                    # txt_data.decode('ISO-8859-2') .decode('utf-8')
                    # unicode(txt_data)

                    author = txt_data.split("****!****")[0].strip(' \t\n\r')

                    title = txt_data.split("****!****")[1].strip(' \t\n\r')

                    bio = txt_data.split("****!****")[2]  #.strip(' \t\n\r')

                    ######  CLEAN BIO
                    bio.replace("\t", "&#9;")
                    bio.replace("\n", " <br>")
                    bio.replace("\r", " <br>")
                    poem_replaced = bio
                    #print poem_replaced

                    ###############################
                    # REPLACE AUTHOR NAME
                    ##############################
                    author_ln = author.split(" ")[-1]
                    author_fn = author.split(" ")[:-1]
                    #
                    #poem_replaced = poem_replaced.replace(author_ln,"Jhave")

                    #######################
                    # fake AUTHOR
                    #######################

                    new_author = " ".join(
                        random.choice(authors).split(" ")
                        [1:-2]) + " " + random.choice(authors).split(" ")[-2]

                    #######################
                    # replace BOOK TITLES
                    #######################
                    #print "TITLES"]
                    new_title = getNewTitle("title").encode('utf-8')

                    ############################
                    # replace years with another
                    ############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(
                                    int(w2) - 5,
                                    int(w2) + 5)
                                #print "REPLACING #:",w2,new_num
                                poem_replaced = poem_replaced.replace(
                                    w2, str(new_num))
                                replaced_ls.append(new_num)

                    #################
                    # Load JSON     #
                    #################
                    response = loadJSONfile(READ_JSON_PATH +
                                            "poetryFoundation_" +
                                            id.split("_")[1] +
                                            "_Alchemy_JSON.txt")

                    if response != "failed":

                        if response.get('entities') is not None:
                            for idx, entity in enumerate(response['entities']):

                                #print idx
                                ce = entity['text'].replace("0xc2", " ")
                                ce = ce.replace("0xe2", "'")
                                ce = re.sub(
                                    '(' +
                                    '|'.join(import_utilities.chars.keys()) +
                                    ')', import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')

                                try:
                                    content = ce.decode('utf-8').encode(
                                        'ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    "AAAARGGGGHHH!!!!"

                                if content in poem_replaced:

                                    ################################################
                                    # Replace similar entities from other JSON     #
                                    ################################################
                                    replacement_entity = findSimilarEntityinRandomJSON(
                                        content, entity['type'])

                                    cr = re.sub(
                                        '(' + '|'.join(
                                            import_utilities.chars.keys()) +
                                        ')', import_utilities.replace_chars,
                                        replacement_entity)

                                    poem_replaced = poem_replaced.replace(
                                        content, replacement_entity)
                                    replaced_ls.append(replacement_entity)

                    ##########################
                    #   POS REPLACMENT       #
                    ##########################

                    token_tuples = nltk.word_tokenize(poem_replaced)
                    tt = nltk.pos_tag(token_tuples)

                    #################
                    #  ADJECTIVES   #
                    #################
                    for i in tt:
                        if "/i" not in i[0] and len(
                                i[0]) > 3 and i[0] != "died":
                            origw = re.sub(
                                '(' + '|'.join(import_utilities.chars.keys()) +
                                ')', import_utilities.replace_chars, i[0])
                            origw = import_utilities.strip_punctuation(origw)
                            if i[1] == 'JJ':
                                JJr = random.choice(JJ)
                                # # JJr =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr)
                                # JJr = import_utilities.strip_punctuation(JJr)
                                JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                    i[0],
                                    JJr.lstrip().lstrip())

                                if i[0].istitle():
                                    JJr = JJr.title()

                                poem_replaced = re.sub(
                                    r'\b' +
                                    import_utilities.strip_punctuation(i[0]) +
                                    r'\b', JJr, poem_replaced,
                                    1)  #poem_replaced.replace(i[0],JJr,1)
                                replaced_ls.append(JJr)
                            if i[1] == 'RB':
                                RBr = random.choice(RB)
                                RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                    i[0],
                                    RBr.lstrip().lstrip())

                                if i[0].istitle():
                                    RBr = RBr.title()
                                poem_replaced = re.sub(
                                    r'\b' +
                                    import_utilities.strip_punctuation(i[0]) +
                                    r'\b', RBr, poem_replaced, 1)
                                replaced_ls.append(RBr)

                    ########################
                    # IS IT ENGLISH?       #
                    ########################
                    for line in poem_replaced.split('\n\r'):

                        if len(line) > 0:
                            if "english" not in import_utilities.get_language(
                                    line):
                                quit_language += 1
                                #print "NOT english:",quit_language,line
                            else:
                                quit_language -= 1

                    #########################
                    #   SYNSET REPLACE      #
                    #########################
                    for idx, word in enumerate(poem_replaced.split(' ')):

                        if "<br>" not in word and "&#9;" not in word and len(
                                word) > 0 and "~~~~!~~~" not in word:

                            words_total += 1

                            #########################
                            #   PRONOUN ' VERB      #
                            #########################
                            if len(word.split("'")) > 1:
                                if word.split("'")[0] in personal_pronouns:
                                    replacement_word = random.choice(
                                        personal_pronouns) + "'" + word.split(
                                            "'")[1] + ' '
                                poem_replaced.replace(word, replacement_word)
                                #print "word,",word,"replacement_word:",replacement_word

                            ####################################################
                            # Replacement of OTHERs                            #
                            ####################################################

                            elif not word.lower().strip(
                                    " \n\t\r") in stopwords.words('english'):

                                # take off leading brackets, commas etc...
                                word_punct_nopunct = import_utilities.strip_punctuation_bool(
                                    word)
                                word_nopunct = word_punct_nopunct[
                                    'word'].strip(" \n\t\r")
                                word_punct = word_punct_nopunct['punct']
                                punct_bool = word_punct_nopunct['punct_bool']

                                #######################################################
                                # MAIN EXCHANGE PROCESS CALL >>>>>>>   GET THE SYNSET #
                                #######################################################
                                if word_nopunct[-4:].lower() == "here":
                                    similarterm = random.choice(
                                        import_utilities.heres)
                                else:
                                    #print "WORD:",word_nopunct
                                    if len(word_nopunct) > 2:
                                        similarterm = import_utilities.find_synset_word(
                                            word_nopunct
                                        )  #(word.lstrip().rstrip())

                                ############################################
                                # manually get rid of some terrible choices
                                ############################################
                                if similarterm == "ilk":
                                    ##print "like"
                                    similarterm = "like"
                                if similarterm == "ope":
                                    ##print "doth"
                                    similarterm = "does"
                                if similarterm == "information technology":
                                    ##print "doth"
                                    similarterm = "it"

                                #######################################
                                # abbreviations for f*****g states!   #
                                #######################################
                                if word_nopunct.upper(
                                ) in import_utilities.state_abbrev and word_nopunct.lower(
                                ) not in stopwords.words(
                                        'english') and "me," not in word:
                                    tmp = similarterm
                                    if word_nopunct == "oh":
                                        similarterm = random.choice(
                                            import_utilities.exclaims)
                                    else:
                                        similarterm = random.choice(RESERVOIR)
                                    #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line

                                ##############
                                # hyphenated #
                                ##############
                                hyp = word.split("-")
                                #print word,len(hyp)
                                if len(hyp) > 1:
                                    similarterm = ""
                                    for w in hyp:
                                        if len(w) > 2:
                                            similarterm += import_utilities.find_synset_word(
                                                w) + "-"
                                    similarterm = import_utilities.strip_underscore(
                                        similarterm[:-1])
                                    #print "hyphenated:",word,"replaced by: "+similarterm

                                #########################################################
                                # is it a TRUNCATED VERB slang as in singin or wishin   #
                                #########################################################
                                if similarterm == word_nopunct and len(
                                        word
                                ) > 2 and 'in' in word_nopunct[-2:]:
                                    similarterm = import_utilities.find_synset_word(
                                        word_nopunct + 'g')
                                    ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm
                                    interim = import_utilities.lemma(
                                        similarterm)
                                    ## #print interim
                                    similarterm = import_utilities.conjugate(
                                        interim,
                                        tense=import_utilities.PARTICIPLE,
                                        parse=True)[:-1]
                                    # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1]

                                #################
                                # SWEAR WORD    #
                                #################
                                ##print "at the garden of if:", word
                                if word_nopunct in import_utilities.curses:
                                    similarterm = random.choice(
                                        import_utilities.curses)
                                    ##print "SWEAR WORD word: '"+word+"'",similarterm

                                if len(hyp) > 1:
                                    replacement_word = similarterm
                                else:
                                    replacement_word = word.replace(
                                        word_nopunct, similarterm)
                                    replacement_word = import_utilities.strip_underscore(
                                        replacement_word)
                                    replacement_word = import_utilities.replaceNumbers(
                                        replacement_word)

                                #########################
                                # RESERVOIR_OF_WEIRDNESS  #
                                #########################

                                if word_nopunct.lower(
                                ) in import_utilities.impera:
                                    replacement_word = random.choice(
                                        import_utilities.impera)
                                    #print word,"IMPERA:",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.conjuncts:
                                    replacement_word = random.choice(
                                        import_utilities.conjuncts)
                                    #print word," CONJUNCTION replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.indef_prono:
                                    replacement_word = random.choice(
                                        import_utilities.indef_prono)
                                    #print word," INDEF_prono replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.prepo:
                                    replacement_word = random.choice(
                                        import_utilities.prepo)
                                    #print word," prepo replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.rel_prono:
                                    replacement_word = word
                                    #print word," rel_prono LEAVE alone: ",replacement_word
                                elif word_nopunct.lower()[-2:] == "ly":
                                    replacement_word = import_utilities.strip_underscore(
                                        import_utilities.find_synset_word(
                                            word))  #(word[:-2])
                                    #print word," ADVERB: ",replacement_word
                                    # if replacement_word[-2:] !="ly":
                                    #     replacement_word +="ly"

                                else:
                                    if len(
                                            hyp
                                    ) < 2 and "like" not in word_nopunct and import_utilities.singularize(
                                            word_nopunct
                                    ) == import_utilities.singularize(
                                            replacement_word
                                    ) and word_nopunct.lower(
                                    ) not in import_utilities.stopwords_ls:

                                        if word_nopunct not in RESERVOIR and quit_language < 0 and import_utilities.countPunctuation(
                                                word
                                        ) < 1 and len(
                                                word_nopunct
                                        ) > 3 and not word_nopunct.istitle():

                                            #print "ADDING",word,"to reservoir"
                                            RESERVOIR.append(word)

                                            replacement_word = random.choice(
                                                RESERVOIR)
                                            #print word_nopunct,"replaced from reservoir with", replacement_word
                                    # print "'"+word_nopunct+"'  vs RESERVOIR  replacement_word:",replacement_word #,"    new_line:",new_line
                                if quit_language > 1 and not word_nopunct.istitle(
                                ):
                                    #print quit_language, "Probably foreign language: make a word salad in english"
                                    replacement_word = random.choice(RESERVOIR)
                                    #print word_nopunct,"OTHER replaced from reservoir with", replacement_word

                                # REPLACEMENT
                                poem_ls = poem_replaced.split(' ')
                                idx = poem_ls.index(word)

                                # #print idx,",", poem_ls[idx],",", word ,",",replacement_word

                                if poem_ls[idx] == word:
                                    poem_ls[idx] = replacement_word
                                poem_replaced = " ".join(poem_ls)

                                #poem_replaced = poem_replaced.replace(word,replacement_word)

                    # CORRECT the "A" to "An"
                    for idx, word in enumerate(poem_replaced.split(" ")):
                        # poem_replaced = poem_replaced+"A organism"
                        if len(word) > 0 and word[0].lower(
                        ) in the_vowels and poem_replaced.split(" ")[
                                idx - 1].lower() == "a":
                            if poem_replaced.split(" ")[idx - 1] == "a":
                                old_str = "a " + poem_replaced.split(" ")[idx]
                                new_str = "an " + poem_replaced.split(" ")[idx]
                            else:
                                old_str = "A " + poem_replaced.split(" ")[idx]
                                new_str = "An " + poem_replaced.split(" ")[idx]
                            poem_replaced = poem_replaced.replace(
                                old_str, new_str)

                        # poem_replaced = poem_replaced+"An consonant"
                        if len(word) > 0 and word[0].lower(
                        ) not in the_vowels and poem_replaced.split(" ")[
                                idx - 1].lower() == "an":
                            if poem_replaced.split(" ")[idx - 1] == "an":
                                old_str = "an " + poem_replaced.split(" ")[idx]
                                new_str = "a " + poem_replaced.split(" ")[idx]
                            else:
                                old_str = "An " + poem_replaced.split(" ")[idx]
                                new_str = "A " + poem_replaced.split(" ")[idx]
                            poem_replaced = poem_replaced.replace(
                                old_str, new_str)
                            #print "FOUND correction needed",old_str,new_str

                    #########################
                    #   WRITE SINGLE POEM   #
                    #########################
                    tmp_poem = ""

                    # poem_replaced.replace("\t","&#9;")
                    # poem_replaced.replace("\n"," <br>")
                    # poem_replaced.replace("\r"," <br>")

                    HTML_poem = ""
                    for line in poem_replaced.split("\n"):
                        lines_total += 1
                        #print "LINE", line
                        HTML_poem += line + "<br>"

                    if len(response) > 0 and len(id.split("_")) > 1:
                        # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem
                        ALL_poems += "<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>" + author + "</b>, <i>" + title + "</i> ]<br><br><b>" + new_title + "<br><br></b>" + HTML_poem

                        tmp_poem = "[A poem generated from template: " + author + ", '" + title + "'']\n\n'" + new_title + "'\nby\n" + new_author + "\n\n" + poem_replaced

                        #####################
                        #                   #
                        #                   #
                        #     PAUSE IT      #
                        #                   #
                        #                   #
                        #####################

                        # sleep_time=0.03*sub_cnt
                        print "sub_cnt=", sub_cnt  # ,"sleep_time=",sleep_time
                        # time.sleep(sleep_time)

                        if sub_cnt >= 1:
                            raw_input("Press Enter to continue...")

                        #####################
                        #                   #
                        #                   #
                        #       PRINT       #
                        #                   #
                        #                   #
                        #####################

                        print "\n******\n" + tmp_poem

                        txt_fn = id.split("_")[1] + "_POEMs.txt"

                        # WRITE_BIO_PATH = DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                        # if not os.path.exists(WRITE_BIO_PATH):
                        #         os.makedirs(WRITE_BIO_PATH)

                        txt_fn_path = GENERATED_DIR + txt_fn
                        f_txt = open(txt_fn_path, 'w')
                        f_txt.write(tmp_poem)  #.encode('utf-8'))
                        f_txt.close()
                        #print "\nTXT file created at:",txt_fn_path

                        # #######
                        # #   write them all.... wasteful... but useful if run is interrupted....
                        # ###########
                        # ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                        # ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt))
                        # print "cnt",cnt
                        # ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time))

                        # # ALL POEMS
                        # txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_"+type_of_run+".html"
                        # txt_fn_path = DATA_DIR+"generated/POEMS/"+txt_fn
                        # f_txt=open(txt_fn_path,'w')
                        # f_txt.write(ALL_poems+"</hmtl>")
                        # f_txt.close();
                        # print "\nTXT file created at:",txt_fn_path

                    else:
                        "~~~~~~~~~~~~~~~~!!!!!!!!!! EMPTY response:", author

Exemple #2

0

Afficher le fichier

Fichier : Alchemy_poetryFoundation_STEP3_POEMS_FEATURE_EXTRACTION_PERVERTED_LOVE_VERSION.py Projet : jhave/DS_HK_2

def extractFeaturesAndWriteBio(READ_PATH, file_type):

    global ALL_poems, bio, cnt

    for subdir, dirs, files in os.walk(READ_PATH):
        for file in files:

            num_of_files = len(files) - 1  # deduct the DS_store
            #print (num_of_files,'readDirectory',READ_PATH)

            if file_type in file and 'readme' not in file:

                # ID
                id = file.split(".")[0]
                print "\nID:", id.split("_")[1]

                filenames.append(id)
                cnt += 1

                # print('')
                # print('')
                # print('OPENED:',id)
                # print('')
                # print('')

                poem_replaced = ""
                replacement_word = ""
                previous_replacement_word = ""

                author = ""
                titles = ""
                title = ""
                new_title = ""

                replaced_ls = []
                new_titles_ls = []
                quit_language = 0
                oscillator = 0

                # if EXCEPTION is raised... do not add to html
                SKIP_bool = False

                ##########################
                # Load  POEM TEXT FILE     #
                ##########################

                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split(
                    "_")[1] + ".txt"
                #print "txt_fn_path:",txt_fn_path

                if os.path.isfile(txt_fn_path) and cnt > 0:
                    txt_data = open(txt_fn_path).read()

                    # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                    # txt_data.decode('ISO-8859-2') .decode('utf-8')
                    # unicode(txt_data)

                    author = txt_data.split("****!****")[0].strip(' \t\n\r')

                    title = txt_data.split("****!****")[1].strip(' \t\n\r')

                    bio = txt_data.split("****!****")[2]  #.strip(' \t\n\r')

                    ######  CLEAN BIO
                    bio.replace("\t", "&#9;")
                    bio.replace("\n", " <br>")
                    bio.replace("\r", " <br>")
                    poem_replaced = bio
                    #print poem_replaced

                    ###############################
                    # REPLACE AUTHOR NAME in poem
                    ##############################
                    author_ln = author.split(" ")[-1].lstrip()
                    author_fn = author.split(" ")[:-1]
                    author = " ".join(n for n in author_fn) + author_ln
                    #
                    #poem_replaced = poem_replaced.replace(author_ln,"Jhave")

                    #######################
                    # replace BOOK TITLES
                    #######################
                    #print "TITLES"]
                    new_title = getNewTitle("title").encode('utf-8')

                    #######################
                    # fake AUTHOR
                    #######################

                    new_author = " ".join(
                        random.choice(authors).split(" ")
                        [1:-2]) + " " + random.choice(authors).split(" ")[-2]
                    #print "new AUTHOR",new_author

                    ############################
                    # replace years with another
                    ############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(
                                    int(w2) - 5,
                                    int(w2) + 5)
                                #print "REPLACING #:",w2,new_num
                                poem_replaced = poem_replaced.replace(
                                    w2, str(new_num))
                                replaced_ls.append(new_num)

                    #################
                    # Load JSON     #
                    #################
                    response = loadJSONfile(READ_JSON_PATH +
                                            "poetryFoundation_" +
                                            id.split("_")[1] +
                                            "_Alchemy_JSON.txt")

                    if response != "failed":

                        if response.get('entities') is not None:
                            for idx, entity in enumerate(response['entities']):

                                #print idx
                                ce = entity['text'].replace("0xc2", " ")
                                ce = ce.replace("0xe2", "'")
                                ce = re.sub(
                                    '(' +
                                    '|'.join(import_utilities.chars.keys()) +
                                    ')', import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')

                                try:
                                    content = ce.decode('utf-8').encode(
                                        'ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    "AAAARGGGGHHH!!!!"

                                if content in poem_replaced:

                                    ################################################
                                    # Replace similar entities from other JSON     #
                                    ################################################
                                    replacement_entity = findSimilarEntityinRandomJSON(
                                        content, entity['type'])

                                    cr = re.sub(
                                        '(' + '|'.join(
                                            import_utilities.chars.keys()) +
                                        ')', import_utilities.replace_chars,
                                        replacement_entity)

                                    poem_replaced = poem_replaced.replace(
                                        content, replacement_entity)
                                    replaced_ls.append(replacement_entity)

                    ##########################
                    #   POS REPLACEMENT      #
                    ##########################

                    token_tuples = nltk.word_tokenize(poem_replaced)
                    tt = nltk.pos_tag(token_tuples)

                    #################
                    #  ADJECTIVES   #
                    #################
                    for i in tt:
                        if "/i" not in i[0] and len(
                                i[0]) > 2 and i[0] != "died":
                            origw = re.sub(
                                '(' + '|'.join(import_utilities.chars.keys()) +
                                ')', import_utilities.replace_chars, i[0])
                            origw = import_utilities.strip_punctuation(origw)
                            if i[1] == 'JJ':
                                JJr = random.choice(JJ)
                                # # JJr =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr)
                                # JJr = import_utilities.strip_punctuation(JJr)
                                JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                    i[0],
                                    JJr.lstrip().lstrip())

                                if i[0].istitle():
                                    JJr = JJr.title()

                                poem_replaced = re.sub(
                                    r'\b' +
                                    import_utilities.strip_punctuation(i[0]) +
                                    r'\b', JJr, poem_replaced,
                                    1)  #poem_replaced.replace(i[0],JJr,1)
                                replaced_ls.append(JJr)
                            if i[1] == 'RB':
                                RBr = random.choice(RB)
                                RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                                    i[0],
                                    RBr.lstrip().lstrip())

                                if i[0].istitle():
                                    RBr = RBr.title()
                                poem_replaced = re.sub(
                                    r'\b' +
                                    import_utilities.strip_punctuation(i[0]) +
                                    r'\b', RBr, poem_replaced, 1)
                                replaced_ls.append(RBr)
                                #print "RBr=",RBr,"repaced",i[0]

                    ########################
                    # IS IT ENGLISH?       #
                    ########################
                    for line in poem_replaced.split('\n\r'):
                        if len(line) > 0:
                            if "english" not in import_utilities.get_language(
                                    line):
                                quit_language += 1
                                #print "NOT english:",quit_language,line
                            else:
                                quit_language -= 1

                    #########################
                    #   SYNSET REPLACE      #
                    #########################
                    for idx, word in enumerate(poem_replaced.split(' ')):

                        if "<br>" not in word and "&#9;" not in word and len(
                                word) > 0:

                            #########################
                            #   PRONOUN ' VERB      #
                            #########################
                            if len(word.split("'")) > 1:
                                if word.split("'")[0] in personal_pronouns:
                                    replacement_word = random.choice(
                                        personal_pronouns) + "'" + word.split(
                                            "'")[1] + ' '
                                poem_replaced.replace(word, replacement_word)
                                #print "word,",word,"replacement_word:",replacement_word

                            ####################################################
                            # Replacement of OTHERs                            #
                            ####################################################

                            else:
                                # elif not word.lower().strip(" \n\t\r") in stopwords.words('english'):

                                # take off leading brackets, commas etc...
                                word_punct_nopunct = import_utilities.strip_punctuation_bool(
                                    word)
                                word_nopunct = word_punct_nopunct[
                                    'word'].strip(" .\n\t\r")
                                word_punct = word_punct_nopunct['punct']
                                punct_bool = word_punct_nopunct['punct_bool']

                                #print "word_nopunct:",word_nopunct

                                #######################################################
                                # MAIN EXCHANGE PROCESS CALL >>>>>>>   GET THE SYNSET #
                                #######################################################
                                similarterm = ""
                                if word_nopunct[-4:].lower() == "here":
                                    similarterm = random.choice(
                                        import_utilities.heres)
                                else:
                                    #print "WORD:",word_nopunct
                                    if len(word_nopunct) > 3:

                                        oscillator = oscillator + 1

                                        ############################################
                                        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                        # STYLE SWITCH..... should in future use POS
                                        # ... i.e. if noun & oscillator%3, do...
                                        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                        ############################################

                                        similarterm = import_utilities.find_synset_word(
                                            word_nopunct)

                                        # synset
                                        # if oscillator%4==0:
                                        #     # SYNSET
                                        #     similarterm = import_utilities.find_synset_word(word_nopunct)
                                        #     #print "synset", similarterm

                                        # elif oscillator%3==0:
                                        #     # RAP MOUTH
                                        #     similarterm = random.choice(rap_mouth)
                                        #     #print "rap",similarterm

                                        # # elif oscillator%2==0:
                                        # else:
                                        #     similarterm = import_utilities.find_synset_word(word_nopunct)
                                        # # SCIENCE MOUTH
                                        # similarterm = random.choice(science_mouth)
                                        # if similarterm.endswith('logy'):
                                        #         similarterm = similarterm[:-4]
                                        # if similarterm.endswith('o'):
                                        #         similarterm = similarterm[:-1]
                                        #print "science_mouth",similarterm
                                        # if len(similarterm)<6:
                                        #     similarterm = random.choice(import_utilities.curses)

                                        # else:
                                        #     # FILTH
                                        #     print "filth"
                                        #     similarterm = random.choice(import_utilities.curses)

                                ############################################
                                # manually get rid of some terrible choices
                                ############################################
                                if similarterm == "ilk":
                                    ##print "like"
                                    similarterm = "like"
                                if similarterm == "ope":
                                    ##print "doth"
                                    similarterm = "does"
                                if similarterm == "information technology":
                                    ##print "doth"
                                    similarterm = "it"
                                if similarterm == "velleity":
                                    ##print "doth"
                                    similarterm = "want"
                                if similarterm == "Crataegus laevigata":
                                    ##print "doth"
                                    similarterm = "may"
                                if similarterm == "brunet" or similarterm == "ot":
                                    ##print "doth"
                                    similarterm = random.choice(
                                        import_utilities.curses)
                                if similarterm == "ge":
                                    ##print "doth"
                                    similarterm = random.choice(science_mouth)
                                if similarterm.lower() == "nox":
                                    ##print "doth"
                                    similarterm = random.choice(science_mouth)
                                if similarterm.lower() == "paunited":
                                    print "################### paUnited ###################"
                                    similarterm = word

                                #######################################
                                # abbreviations for f*****g states!   #
                                #######################################
                                if word_nopunct.upper(
                                ) in import_utilities.state_abbrev and word_nopunct.lower(
                                ) not in stopwords.words(
                                        'english') and "me," not in word:
                                    tmp = similarterm
                                    if word_nopunct == "oh":
                                        similarterm = random.choice(
                                            import_utilities.exclaims)
                                    else:
                                        similarterm = random.choice(
                                            rap_mouth)  # RESERVOIR)RESERVOIR)
                                    #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line

                                ##############
                                # hyphenated #
                                ##############
                                hyp = word.split("-")
                                #print word,len(hyp)
                                if len(hyp) > 1:
                                    similarterm = ""
                                    for w in hyp:
                                        if len(w) > 2:
                                            similarterm += import_utilities.find_synset_word(
                                                w) + "-"
                                    similarterm = import_utilities.strip_underscore(
                                        similarterm[:-1])
                                    #print "hyphenated:",word,"replaced by: "+similarterm

                                #########################################################
                                # is it a TRUNCATED VERB slang as in singin or wishin   #
                                #########################################################
                                if similarterm == word_nopunct and len(
                                        word
                                ) > 2 and 'in' in word_nopunct[-2:]:
                                    similarterm = import_utilities.find_synset_word(
                                        word_nopunct + 'g')
                                    #print "TRUNCATED SLANG word: '"+word+"'",similarterm
                                    interim = import_utilities.lemma(
                                        similarterm)
                                    ## #print interim
                                    similarterm = import_utilities.conjugate(
                                        interim,
                                        tense=import_utilities.PARTICIPLE,
                                        parse=True)[:-1]
                                    # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1]

                                #################
                                # SWEAR WORD    #
                                #################
                                ##print "at the garden of if:", word
                                if word_nopunct in import_utilities.curses:
                                    similarterm = random.choice(
                                        import_utilities.curses)
                                    #print "SWEAR WORD word: '"+word+"'",similarterm

                                if len(hyp) > 1:
                                    replacement_word = similarterm
                                else:
                                    replacement_word = word.replace(
                                        word_nopunct, similarterm)
                                    replacement_word = import_utilities.strip_underscore(
                                        replacement_word)
                                    replacement_word = import_utilities.replaceNumbers(
                                        replacement_word)

                                #print "replacement_word:",replacement_word

                                #########################
                                # RESERVOIR_OF_WEIRDNESS  #
                                #########################

                                if word_nopunct.lower(
                                ) in import_utilities.impera:
                                    replacement_word = random.choice(
                                        import_utilities.impera)
                                    #print word,"IMPERA:",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.conjuncts:
                                    replacement_word = random.choice(
                                        import_utilities.conjuncts)
                                    #print word," CONJUNCTION replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.indef_prono:
                                    replacement_word = random.choice(
                                        import_utilities.indef_prono)
                                    #print word," INDEF_prono replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.prepo:
                                    replacement_word = random.choice(
                                        import_utilities.prepo)
                                    #print word," prepo replaced with",replacement_word
                                elif word_nopunct.lower(
                                ) in import_utilities.rel_prono:
                                    replacement_word = word
                                    #print word," rel_prono LEAVE alone: ",replacement_word
                                elif word_nopunct.lower(
                                )[-2:] == "ly" or word_nopunct.lower(
                                )[-3:] == "ly.":
                                    replacement_word = import_utilities.strip_underscore(
                                        import_utilities.find_synset_word(
                                            word))  #(word[:-2])
                                    #print word," ADVERB: ",replacement_word
                                    # if replacement_word[-2:] !="ly":
                                    #     replacement_word +="ly"

                                else:
                                    if len(
                                            hyp
                                    ) < 2 and "like" not in word_nopunct and import_utilities.singularize(
                                            word_nopunct
                                    ) == import_utilities.singularize(
                                            replacement_word
                                    ) and word_nopunct.lower(
                                    ) not in import_utilities.stopwords_ls:

                                        if word not in RESERVOIR and import_utilities.countPunctuation(
                                                word
                                        ) < 1 and len(
                                                word_nopunct
                                        ) > 3 and not word_nopunct.istitle():

                                            if len(
                                                    word
                                            ) > 4 and english_dict.check(word):
                                                #print "ADDING",word,"to reservoir"
                                                RESERVOIR.append(word)
                                                #RESERVOIR = list(set())

                                            replacement_word = random.choice(
                                                RESERVOIR)
                                            #print word_nopunct,"replaced from reservoir with", replacement_word
                                    # print "'"+word_nopunct+"'  vs RESERVOIR  replacement_word:",replacement_word #,"    new_line:",new_line
                                if quit_language > 1 and not word_nopunct.istitle(
                                ):
                                    #print quit_language, "Probably foreign language: make a word salad in english"
                                    replacement_word = random.choice(
                                        rap_mouth)  #RESERVOIR)
                                    #print word_nopunct,"OTHER replaced from reservoir with", replacement_word

                                ###################################################
                                # MOST REPLACEMENT occurs here...                 #
                                ###################################################
                                poem_ls = poem_replaced.split(' ')
                                idx = poem_ls.index(word)

                                #print idx,",", poem_ls[idx],",", word ,",",replacement_word
                                #print word ," --- ",previous_replacement_word,replacement_word

                                try:
                                    #print "poem_ls[idx]",poem_ls[idx],"word",word
                                    if poem_ls[
                                            idx] == word and "****" not in word and "." != word and "\n" not in word:
                                        # if "\n" in word:
                                        #     replacement_word=replacement_word+"\n"
                                        # if replacement_word=="":
                                        #     replacement_word=random.choice(RESERVOIR)
                                        poem_ls[
                                            idx] = replacement_word  #.encode('utf-8')
                                        "REPLACE", word, "with", replacement_word
                                    poem_replaced = " ".join(poem_ls)

                                    # store this word so that conjugation can be checked
                                    previous_replacement_word = replacement_word
                                except Exception, e:
                                    print "PENULTIMATE SKIP_bool replace FAIL", e
                                    SKIP_bool = True
                                    continue

                    ###########################################################################
                    # testing Pattern.en as parser for conjugation and article replacement    #
                    # much more robust than my hand-coded hacks                               #
                    ###########################################################################

                    # correct CONJUGATION of participle verbs with pattern.en
                    parsed = parse(poem_replaced, tags=True)
                    pre_verbal = ["'m", "'s", "'re"]
                    for idx, p in enumerate(parsed.split(" ")):
                        tok = p.split("/")[0]
                        typ = p.split("/")[1]
                        #print idx,tok,typ
                        if tok in pre_verbal:
                            #print "pre_verbal:",tok
                            next_word = parsed.split(" ")[idx + 1].split("/")

                            # try try try
                            for ix, n in enumerate(next_word):
                                next_word[ix] = re.sub(
                                    '(' +
                                    '|'.join(import_utilities.chars.keys()) +
                                    ')', import_utilities.replace_chars,
                                    n).encode('utf-8')
                            try:
                                #print  next_word,next_word[0],next_word[1][:2]
                                # if it's a verb that follows
                                if next_word[1][:2] == "VB":
                                    before_verb = " ".join(
                                        w for w in poem_replaced.split(" ")
                                        [:idx])  #.encode('utf-8')
                                    after_verb = " ".join(
                                        w for w in poem_replaced.split(" ")
                                        [idx + 1:])  #.encode('utf-8')
                                    new_verb = conjugate(
                                        next_word[0],
                                        tense=PARTICIPLE,
                                        parse=True).encode('utf-8')
                                    # insert new
                                    #print "CONJUGATION needed, changing:",poem_replaced.split(" ")[idx],"to",parsed.split(" ")[idx],poem_replaced.split(" ")[idx-1]+" "+new_verb
                                    poem_replaced = before_verb + " " + new_verb + " " + after_verb
                            except Exception, e:
                                #print "INside parsed COnjugation loop",e
                                continue

                    # correct ARTICLES
                    for idx, word in enumerate(poem_replaced.split(" ")):
                        if len(word) > 0 and idx != 0 and " " not in word:
                            # A or AN
                            if poem_replaced.split(" ")[idx - 1].lower(
                            ) == "a" or poem_replaced.split(" ")[
                                    idx - 1].lower() == "an":
                                #print word,"---",article(word)+" "+word
                                before_article = " ".join(
                                    w for w in poem_replaced.split(" ")[:idx -
                                                                        1])
                                after_article = " ".join(
                                    w for w in poem_replaced.split(" ")[idx +
                                                                        1:])
                                new_conj = referenced(word)
                                # capitalize
                                if poem_replaced.split(" ")[idx - 1].istitle():
                                    new_conj = new_conj.split(" ")[0].title(
                                    ) + " " + new_conj.split(" ")[1]
                                poem_replaced = before_article + " " + new_conj + " " + after_article

                    #########################
                    #   WRITE SINGLE POEM   #
                    #########################
                    if not SKIP_bool:

                        tmp_poem = ""

                        # poem_replaced.replace("\t","&#9;")
                        # poem_replaced.replace("\n"," <br>")
                        # poem_replaced.replace("\r"," <br>")

                        HTML_poem = ""
                        for line in poem_replaced.split("\n"):
                            #print "LINE", line
                            HTML_poem += line + "<br>"

                        if len(response) > 0 and len(id.split("_")) > 1:
                            # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem

                            try:
                                ALL_poems = "<br>[ A  generated-poem based upon: <i>" + title + "</i> by <b>" + author + "</b>]<br><br><i>" + new_title + "</i><br> by <b>" + new_author + "</b><br>" + HTML_poem + ALL_poems.split(
                                    "</h2>")[1].replace("  ", "&nbsp")

                                tmp_poem = "[A generated-poem based upon: '" + title + "' by " + author + "]\n\n" + new_title + "\nby " + new_author + "\n" + poem_replaced

                                print "\n~~~\n\n" + tmp_poem
                                #print "\nORIGINAL:",bio

                                txt_fn = id.split("_")[1] + "_POEMs.txt"

                                WRITE_BIO_PATH = DATA_DIR + "generated/POEMS/POEMS_" + datetime.datetime.now(
                                ).strftime('%Y-%m-%d_%H') + "/"
                                if not os.path.exists(WRITE_BIO_PATH):
                                    os.makedirs(WRITE_BIO_PATH)

                                txt_fn_path = WRITE_BIO_PATH + txt_fn
                                f_txt = open(txt_fn_path, 'w')
                                f_txt.write(tmp_poem)  #.encode('utf-8'))
                                f_txt.close()
                                #print "\nTXT file created at:",txt_fn_path

                                #######
                                #   write them all.... wasteful... but useful if run is interrupted....
                                ###########

                                # if cnt==1:
                                #     ALL_poems = ALL_poems_intro+ALL_poems
                                # else:
                                ALL_poems = ALL_poems_intro + ALL_poems.replace(
                                    "  ", "&nbsp")
                                ALL_poems = ALL_poems.replace(
                                    "$$datetime$$",
                                    datetime.datetime.now().strftime(
                                        '%Y-%m-%d at %H:%M'))
                                ALL_poems = ALL_poems.replace(
                                    "$$cnt$$", str(cnt))
                                #print "cnt",cnt
                                ALL_poems = ALL_poems.replace(
                                    "$$gentime$$",
                                    str(time.time() - start_time))

                                # ALL POEMS
                                txt_fn = datetime.datetime.now().strftime(
                                    '%Y-%m-%d_%H'
                                ) + "_poetryFoundation_generatedPOEMS_" + type_of_run + ".html"
                                txt_fn_path = DATA_DIR + "generated/POEMS/" + txt_fn
                                f_txt = open(txt_fn_path, 'w')
                                f_txt.write(ALL_poems + "</hmtl>")
                                f_txt.close()
                                #print "\nTXT file created at:",txt_fn_path
                            except Exception, e:
                                print "At the final LOOP", e
                                continue

                        else:
                            print "~! EMPTY response:", author

                    else:
                        cnt = cnt - 1

Exemple #3

0

Afficher le fichier

Fichier : Alchemy_poetryFoundation_STEP3_POEMS_FEATURE_EXTRACTION_HATCHER_ELO_INPUT_RANDom.py Projet : amshenoy/Big-Data-Poetry

def extractFeaturesAndWriteBio(READ_PATH,file_type):
    
    

    global ALL_poems,bio,cnt, start_time

    inp=0
    sub_cnt=0
    words_total=0
    lines_total=0

    for subdir, dirs, files in os.walk(READ_PATH):
        for file in files:
            
            num_of_files = len(files)-1 # deduct the DS_store
            #print (num_of_files,'readDirectory',READ_PATH)
            
            if file_type in file  and 'readme' not in file:

                # ID
                id=file.split(".")[0]
                #print "\n\n*********\nID:",id

                filenames.append(id)
                cnt+=1

                # print('')
                #print('')
                # print('OPENED:',id)
                # print('')
                #print('')

                ##############
                #  HOW MANY? #
                ##############
                sub_cnt+=1
                if sub_cnt>=inp:
                    if inp != 0:
                        end_time = time.time()
                        es = end_time-start_time
                        print sub_cnt, "poems,\n",lines_total,"lines, &\n",words_total,"words \ngenerated in\n",("%.2f" % es),"seconds"
                        
                    words_total=0
                    lines_total=0

                    # RESTART

                    sub_cnt=0
                    inp = input("\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ")
                    print "\n\n^^^^^^^^^^^^^^^"
                    start_time = time.time()

                print 'Poem #',sub_cnt+1

                poem_replaced = ""
                replacement_word = ""
                author=""
                titles=""
                title=""
                new_title=""

                replaced_ls =[]
                new_titles_ls = []
                quit_language=0

                #################################################################
                # Load  POEM TEXT FILE (based on id extracted from Alchemy JSON)    #
                #################################################################

                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt"
                #print "txt_fn_path:",txt_fn_path

                if os.path.isfile(txt_fn_path) and cnt>0:
                    txt_data=open(txt_fn_path).read()

                    # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                    # txt_data.decode('ISO-8859-2') .decode('utf-8')
                    # unicode(txt_data)

                    author=txt_data.split("****!****")[0].strip(' \t\n\r')
                    
                    title=txt_data.split("****!****")[1].strip(' \t\n\r')
                    
                    bio=txt_data.split("****!****")[2]#.strip(' \t\n\r')

                    ######  CLEAN BIO
                    bio.replace("\t","&#9;")
                    bio.replace("\n"," <br>")
                    bio.replace("\r"," <br>")
                    poem_replaced=bio
                    #print poem_replaced

                    ###############################
                    # REPLACE AUTHOR NAME
                    ##############################
                    author_ln=author.split(" ")[-1]
                    author_fn=author.split(" ")[:-1]
                    #
                    #poem_replaced = poem_replaced.replace(author_ln,"Jhave")

                    #######################
                    # fake AUTHOR
                    #######################
                    
                    new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2]
                    

                    #######################
                    # replace BOOK TITLES
                    #######################
                    #print "TITLES"]
                    new_title = getNewTitle("title").encode('utf-8')
                             

                    ############################
                    # replace years with another
                    ############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(int(w2)-5,int(w2)+5)
                                #print "REPLACING #:",w2,new_num
                                poem_replaced = poem_replaced.replace(w2,str(new_num))
                                replaced_ls.append(new_num)                            
                                               

                    #################
                    # Load JSON     #
                    #################
                    response = loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt")

                    if response != "failed":

                        if response.get('entities') is not None:
                            for idx,entity in enumerate(response['entities']):

                                #print idx
                                ce = entity['text'].replace("0xc2"," ")
                                ce = ce.replace("0xe2","'")
                                ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')

                                try:
                                    content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    "AAAARGGGGHHH!!!!"

                                if content in poem_replaced:
                                                       
                                    ################################################
                                    # Replace similar entities from other JSON     #
                                    ################################################
                                    replacement_entity = findSimilarEntityinRandomJSON(content,entity['type'])

                                    cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity)

                                    poem_replaced = poem_replaced.replace(content,replacement_entity)
                                    replaced_ls.append(replacement_entity)
                    

                    ##########################
                    #   POS REPLACMENT       #
                    ##########################

                    token_tuples = nltk.word_tokenize(poem_replaced)
                    tt = nltk.pos_tag(token_tuples)

                    #################
                    #  ADJECTIVES   #
                    #################
                    for i in tt:
                        if "/i" not in i[0] and len(i[0])>3 and i[0] != "died":
                            origw =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0])
                            origw =import_utilities.strip_punctuation(origw) 
                            if i[1]=='JJ' :
                                JJr = random.choice(JJ)
                                # # JJr =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr)
                                # JJr = import_utilities.strip_punctuation(JJr)
                                JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.lstrip().lstrip())
                                
                                if i[0].istitle():
                                    JJr = JJr.title()

                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1)
                                replaced_ls.append(JJr)
                            if i[1]=='RB':
                                RBr = random.choice(RB)
                                RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.lstrip().lstrip())

                                if i[0].istitle():
                                    RBr = RBr.title()
                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0])  + r'\b', RBr, poem_replaced,1)
                                replaced_ls.append(RBr)


                    ########################
                    # IS IT ENGLISH?       #
                    ########################
                    for line  in poem_replaced.split('\n\r'):

                        

                        if len(line)>0 :
                            if "english" not in import_utilities.get_language(line):
                                quit_language+=1
                                #print "NOT english:",quit_language,line
                            else:
                                quit_language-=1

                    
                    #########################
                    #   SYNSET REPLACE      #
                    #########################
                    for idx,word in enumerate(poem_replaced.split(' ')):


                        

                        if "<br>" not in word and "&#9;" not in word and len(word)>0 and "~~~~!~~~" not in word:


                            words_total+=1

                            #########################
                            #   PRONOUN ' VERB      #
                            #########################
                            if len(word.split("'"))>1:
                                if word.split("'")[0] in personal_pronouns:
                                    replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' '
                                poem_replaced.replace(word,replacement_word)             
                                #print "word,",word,"replacement_word:",replacement_word
                           
                            ####################################################
                            # Replacement of OTHERs                            #
                            ####################################################

                            elif not word.lower().strip(" \n\t\r") in stopwords.words('english'):

                                # take off leading brackets, commas etc...
                                word_punct_nopunct = import_utilities.strip_punctuation_bool(word)
                                word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r")
                                word_punct = word_punct_nopunct['punct']
                                punct_bool = word_punct_nopunct['punct_bool']

                             

                                #######################################################
                                # MAIN EXCHANGE PROCESS CALL >>>>>>>   GET THE SYNSET #
                                #######################################################    
                                if word_nopunct[-4:].lower()=="here":
                                    similarterm=random.choice(import_utilities.heres)
                                else:
                                    #print "WORD:",word_nopunct
                                    if len(word_nopunct)>2:
                                        similarterm = import_utilities.find_synset_word(word_nopunct)#(word.lstrip().rstrip())

                                
                                ############################################
                                # manually get rid of some terrible choices
                                ############################################
                                if similarterm == "ilk":
                                    ##print "like"
                                    similarterm = "like"
                                if similarterm == "ope":
                                    ##print "doth"
                                    similarterm = "does"
                                if similarterm == "information technology":
                                    ##print "doth"
                                    similarterm = "it"

                                #######################################                      
                                # abbreviations for f*****g states!   #
                                #######################################
                                if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word:
                                    tmp = similarterm
                                    if word_nopunct == "oh": 
                                        similarterm = random.choice(import_utilities.exclaims)
                                    else:
                                        similarterm = random.choice(RESERVOIR)
                                    #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line

                                ##############
                                # hyphenated #
                                ##############
                                hyp =word.split("-")
                                #print word,len(hyp)
                                if len(hyp) >1:
                                    similarterm=""
                                    for w in hyp:
                                        if len(w) > 2:
                                            similarterm +=  import_utilities.find_synset_word(w)+"-"
                                    similarterm = import_utilities.strip_underscore(similarterm[:-1])
                                    #print "hyphenated:",word,"replaced by: "+similarterm
                                        


                                
                                #########################################################    
                                # is it a TRUNCATED VERB slang as in singin or wishin   #
                                #########################################################
                                if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]:
                                    similarterm = import_utilities.find_synset_word(word_nopunct+'g')
                                    ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm
                                    interim = import_utilities.lemma(similarterm)
                                    ## #print interim
                                    similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] 
                                    # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1]
                                   

                                #################      
                                # SWEAR WORD    #
                                #################
                                ##print "at the garden of if:", word
                                if word_nopunct in import_utilities.curses:
                                    similarterm = random.choice(import_utilities.curses)
                                    ##print "SWEAR WORD word: '"+word+"'",similarterm


                                if len(hyp) >1:
                                    replacement_word = similarterm
                                else:
                                    replacement_word = word.replace(word_nopunct, similarterm)
                                    replacement_word = import_utilities.strip_underscore(replacement_word)
                                    replacement_word = import_utilities.replaceNumbers(replacement_word)

                                #########################
                                # RESERVOIR_OF_WEIRDNESS  #
                                #########################  

                                if word_nopunct.lower() in import_utilities.impera:
                                    replacement_word=random.choice(import_utilities.impera)
                                    #print word,"IMPERA:",replacement_word
                                elif word_nopunct.lower() in import_utilities.conjuncts:
                                    replacement_word=random.choice(import_utilities.conjuncts)
                                    #print word," CONJUNCTION replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.indef_prono:
                                    replacement_word=random.choice(import_utilities.indef_prono)
                                    #print word," INDEF_prono replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.prepo:
                                    replacement_word=random.choice(import_utilities.prepo)
                                    #print word," prepo replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.rel_prono:
                                    replacement_word=word
                                    #print word," rel_prono LEAVE alone: ",replacement_word
                                elif word_nopunct.lower()[-2:] =="ly":
                                    replacement_word=import_utilities.strip_underscore(import_utilities.find_synset_word(word))#(word[:-2])
                                    #print word," ADVERB: ",replacement_word
                                    # if replacement_word[-2:] !="ly":
                                    #     replacement_word +="ly"
                                                                            
                                else:
                                    if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) ==  import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls:

                                        if word_nopunct not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): 
                                            
                                            #print "ADDING",word,"to reservoir"
                                            RESERVOIR.append(word)
                                            
                                            replacement_word = random.choice(RESERVOIR)
                                            #print word_nopunct,"replaced from reservoir with", replacement_word
                                       # print "'"+word_nopunct+"'  vs RESERVOIR  replacement_word:",replacement_word #,"    new_line:",new_line
                                if quit_language>1 and not word_nopunct.istitle():
                                    #print quit_language, "Probably foreign language: make a word salad in english"
                                    replacement_word = random.choice(RESERVOIR)
                                    #print word_nopunct,"OTHER replaced from reservoir with", replacement_word
                                

                                # REPLACEMENT
                                poem_ls = poem_replaced.split(' ')
                                idx =  poem_ls.index(word)


                                # #print idx,",", poem_ls[idx],",", word ,",",replacement_word

                                if poem_ls[idx]==word:
                                    poem_ls[idx]=replacement_word
                                poem_replaced = " ".join(poem_ls)


                                #poem_replaced = poem_replaced.replace(word,replacement_word)



                    # CORRECT the "A" to "An"    
                    for idx,word in enumerate(poem_replaced.split(" ")):
                        # poem_replaced = poem_replaced+"A organism"
                        if len(word)>0 and word[0].lower() in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="a" :      
                                if poem_replaced.split(" ")[idx-1] =="a":
                                    old_str = "a "+poem_replaced.split(" ")[idx]    
                                    new_str = "an "+poem_replaced.split(" ")[idx]
                                else:
                                    old_str = "A "+poem_replaced.split(" ")[idx]    
                                    new_str = "An "+poem_replaced.split(" ")[idx]
                                poem_replaced = poem_replaced.replace(old_str,new_str)

                        # poem_replaced = poem_replaced+"An consonant"
                        if len(word)>0 and word[0].lower() not in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="an" :      
                                if poem_replaced.split(" ")[idx-1] =="an":
                                    old_str = "an "+poem_replaced.split(" ")[idx]    
                                    new_str = "a "+poem_replaced.split(" ")[idx]
                                else:
                                    old_str = "An "+poem_replaced.split(" ")[idx]    
                                    new_str = "A "+poem_replaced.split(" ")[idx]
                                poem_replaced = poem_replaced.replace(old_str,new_str)
                                #print "FOUND correction needed",old_str,new_str


                    #########################
                    #   WRITE SINGLE POEM   #
                    #########################
                    tmp_poem=""   

                    # poem_replaced.replace("\t","&#9;")
                    # poem_replaced.replace("\n"," <br>")
                    # poem_replaced.replace("\r"," <br>")

                    HTML_poem=""
                    for line in poem_replaced.split("\n"):
                        lines_total+=1
                        #print "LINE", line
                        HTML_poem += line+"<br>"

                    if len(response) >0 and len(id.split("_"))>1:
                        # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem
                        ALL_poems += "<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem

                        tmp_poem= "[A poem generated from template: "+ author+", '"+ title +"'']\n\n'"+new_title+"'\nby\n"+new_author+"\n\n"+poem_replaced

                        print "\n******\n"+tmp_poem
                        #print "\nORIGINAL:",bio
                

                        txt_fn = id.split("_")[1]+"_POEMs.txt"

                        # WRITE_BIO_PATH = DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                        # if not os.path.exists(WRITE_BIO_PATH):
                        #         os.makedirs(WRITE_BIO_PATH)

                        txt_fn_path = GENERATED_DIR+txt_fn
                        f_txt=open(txt_fn_path,'w')
                        f_txt.write(tmp_poem)#.encode('utf-8'))       
                        f_txt.close();   
                        #print "\nTXT file created at:",txt_fn_path

                        
                        # #######
                        # #   write them all.... wasteful... but useful if run is interrupted....
                        # ###########    
                        # ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                        # ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt))
                        # print "cnt",cnt
                        # ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time))

                        # # ALL POEMS
                        # txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_"+type_of_run+".html"
                        # txt_fn_path = DATA_DIR+"generated/POEMS/"+txt_fn
                        # f_txt=open(txt_fn_path,'w')
                        # f_txt.write(ALL_poems+"</hmtl>")       
                        # f_txt.close();   
                        # print "\nTXT file created at:",txt_fn_path





                    else:
                        "~~~~~~~~~~~~~~~~!!!!!!!!!! EMPTY response:", author

Exemple #4

0

Afficher le fichier

Fichier : Alchemy_poetryFoundation_STEP3_POEMS_CREELEY_SLOWER.py Projet : amshenoy/Big-Data-Poetry

def _write_text_file(path, text):
    """Write *text* to *path*, closing the handle even if the write fails."""
    with open(path, 'w') as handle:
        handle.write(text)


def _shift_parenthesised_years(text):
    """Nudge each all-digit run delimited by "(" / ")" by up to five.

    The new number replaces every occurrence of the old digits in *text*,
    not only the parenthesised one.
    """
    for chunk in text.split("("):
        for piece in chunk.split(")"):
            if piece.isdigit():
                year = int(piece)
                text = text.replace(piece, str(random.randint(year - 5, year + 5)))
    return text


def _swap_entities(text, response):
    """Replace each entity of *response* found in *text* with a similar one.

    Entities whose text cannot be converted to ASCII character references
    are skipped; previously the failed conversion was ignored and the value
    from the preceding entity (or an unbound name) was used instead.
    """
    chars_pattern = '(' + '|'.join(import_utilities.chars.keys()) + ')'
    for entity in response['entities']:
        name = entity['text'].replace("0xc2", " ").replace("0xe2", "'")
        name = re.sub(chars_pattern, import_utilities.replace_chars, name)
        try:
            content = name.encode('utf-8').decode('utf-8').encode('ascii', 'xmlcharrefreplace')
        except UnicodeDecodeError:
            continue

        # An empty needle would make str.replace insert the replacement
        # between every character of the poem.
        if content and content in text:
            replacement_entity = findSimilarEntityinRandomJSON(content, entity['type'])
            text = text.replace(content, replacement_entity)
    return text


def _swap_adjectives_and_adverbs(text):
    """Swap the first match of each JJ / RB token for a random pool word.

    Tokens of three characters or fewer, tokens containing "/i" and the
    word "died" are left alone.
    """
    for token, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        if "/i" in token or len(token) <= 3 or token == "died":
            continue
        if tag == 'JJ':
            pool = JJ
        elif tag == 'RB':
            pool = RB
        else:
            continue

        substitute = import_utilities.moveBeginAndEndPunctuationFromStrToString(
            token, random.choice(pool).lstrip())
        if token.istitle():
            substitute = substitute.title()

        bare_token = import_utilities.strip_punctuation(token)
        if not bare_token:
            # r'\b\b' would match at the first word boundary of the poem.
            continue
        # Escape the token and use a function replacement so that neither
        # side is interpreted as regular-expression syntax.
        text = re.sub(r'\b' + re.escape(bare_token) + r'\b',
                      lambda match: substitute, text, count=1)
    return text


def _conjugate_after_contractions(text):
    """Turn a verb that follows 'm / 's / 're into its participle.

    NOTE(review): the position comes from the parser's token list but is
    applied to the space-split poem; the two tokenisations presumably do
    not always line up, so the splice can land on a neighbouring word.
    Kept as in the original -- confirm before relying on it.
    """
    chars_pattern = '(' + '|'.join(import_utilities.chars.keys()) + ')'
    parsed_tokens = parse(text, tags=True).split(" ")
    for position, tagged in enumerate(parsed_tokens):
        if tagged.split("/")[0] not in ("'m", "'s", "'re"):
            continue
        if position + 1 >= len(parsed_tokens):
            # Contraction is the last token: there is no following word.
            break
        try:
            following = [
                re.sub(chars_pattern, import_utilities.replace_chars, part).encode('utf-8')
                for part in parsed_tokens[position + 1].split("/")
            ]
            if following[1][:2] == "VB":
                words = text.split(" ")
                new_verb = conjugate(following[0], tense=PARTICIPLE, parse=True).encode('utf-8')
                text = (" ".join(words[:position]) + " " + new_verb + " "
                        + " ".join(words[position + 1:]))
        except Exception:
            # Best effort: a word that cannot be conjugated stays unchanged.
            continue
    return text


def _fix_articles(text):
    """Re-derive "a" / "an" from the word that follows each article.

    Works on a token list so positions stay stable; the earlier string
    splicing added a leading space whenever the article was the first token,
    which shifted every later index by one.
    """
    tokens = text.split(" ")
    for i in range(1, len(tokens)):
        word = tokens[i]
        previous = tokens[i - 1]
        if not word or previous.lower() not in ("a", "an"):
            continue
        article, space, noun = referenced(word).partition(" ")
        if not space:
            continue
        if previous.istitle():
            article = article.title()
        tokens[i - 1] = article
        tokens[i] = noun
    return " ".join(tokens)


def extractFeaturesAndWriteBio(READ_PATH,file_type):
    """Generate one rewritten poem for every matching file under READ_PATH.

    A file matches when its name contains *file_type* and does not contain
    'readme'.  The second "_"-separated part of the file stem selects the
    poem text file and the entity JSON file.  The text file is split on
    "****!****" into author, title and body; the body is then rewritten in
    stages (year shifting, entity swap, adjective/adverb swap, per-word
    synonym replacement, verb conjugation, article correction).

    For each poem that was not flagged as failed, a plain-text file is
    written and the cumulative HTML page is rewritten, so an interrupted run
    keeps everything generated so far.

    Reads and rebinds the module globals ALL_poems, bio and cnt, appends to
    filenames, and may append to RESERVOIR.  Returns None.
    """
    global ALL_poems, bio, cnt

    # Synonyms that read badly in a poem and the word used in their place.
    synonym_overrides = {
        "ilk": "like",
        "Nox": "nite",
        "ope": "does",
        "information technology": "it",
        "velleity": "want",
        "Crataegus laevigata": "may",
        "eff": "know",
        "naw": "mind",
    }

    for _subdir, _dirs, files in os.walk(READ_PATH):
        for filename in files:
            if file_type not in filename or 'readme' in filename:
                continue

            poem_id = filename.split(".")[0]
            filenames.append(poem_id)
            cnt += 1

            ##########################
            # Load POEM TEXT FILE    #
            ##########################
            txt_fn_path = DATA_DIR + READ_TXT_PATH + poem_id.split("_")[1] + ".txt"
            if not (os.path.isfile(txt_fn_path) and cnt > 0):
                continue

            with open(txt_fn_path) as handle:
                sections = handle.read().split("****!****")
            if len(sections) < 3:
                # Not in the author / title / body layout: nothing to rewrite.
                continue

            # Collapse whitespace inside the name; the earlier code glued the
            # last name onto the first names without a separating space.
            author = " ".join(sections[0].split())
            title = sections[1].strip(' \t\n\r')

            # The body keeps its raw tabs and newlines here (the old
            # bio.replace(...) calls discarded their results, so they never
            # changed anything).  Line breaks become <br> only when the HTML
            # is assembled below, which keeps the plain-text output clean.
            bio = sections[2]
            poem_replaced = bio

            new_title = getNewTitle("title").encode('utf-8')

            # Fake author assembled from two randomly chosen entries.
            new_author = (" ".join(random.choice(authors).split(" ")[1:-2])
                          + " " + random.choice(authors).split(" ")[-2])

            poem_replaced = _shift_parenthesised_years(poem_replaced)

            #################
            # Load JSON     #
            #################
            response = loadJSONfile(READ_JSON_PATH + "poetryFoundation_"
                                    + poem_id.split("_")[1] + "_Alchemy_JSON.txt")
            if response != "failed" and response.get('entities') is not None:
                poem_replaced = _swap_entities(poem_replaced, response)

            poem_replaced = _swap_adjectives_and_adverbs(poem_replaced)

            ########################
            # IS IT ENGLISH?       #
            ########################
            # NOTE(review): '\n\r' is an unusual separator, so most poems are
            # presumably scored as a single chunk.  Left unchanged because a
            # per-line vote would alter the thresholds used below -- confirm.
            quit_language = 0
            for line in poem_replaced.split('\n\r'):
                if len(line) > 0:
                    if "english" not in import_utilities.get_language(line):
                        quit_language += 1
                    else:
                        quit_language -= 1

            #########################
            #   SYNSET REPLACE      #
            #########################
            # Looked up once per poem instead of once (or twice) per word.
            english_stopwords = set(stopwords.words('english'))
            skip_poem = False

            for word_idx, word in enumerate(poem_replaced.split(' ')):
                if not word or "<br>" in word or "&#9;" in word:
                    continue

                # PRONOUN ' VERB: swap the pronoun of a contraction.
                pieces = word.split("'")
                if len(pieces) > 1:
                    if pieces[0] in personal_pronouns:
                        replacement_word = (random.choice(personal_pronouns)
                                            + "'" + pieces[1] + ' ')
                        # str.replace returns a new string; the result used
                        # to be thrown away, so this swap never took effect.
                        poem_replaced = poem_replaced.replace(word, replacement_word)
                    continue

                if word.lower().strip(" \n\t\r") in english_stopwords:
                    continue

                # take off leading brackets, commas etc...
                word_nopunct = import_utilities.strip_punctuation_bool(word)['word'].strip(" \n\t\r")
                lowered = word_nopunct.lower()

                # Start every word from a clean state.  These two used to
                # carry over from the previous word, so short or unmatched
                # words were overwritten with the previous word's synonym
                # (and the very first short word raised NameError).
                similarterm = None
                replacement_word = word

                if word_nopunct[-4:].lower() == "here":
                    similarterm = random.choice(import_utilities.heres)
                elif len(word_nopunct) > 3:
                    similarterm = import_utilities.synset_creeley(word_nopunct)
                    if (similarterm is not None and similarterm == word_nopunct
                            and len(word_nopunct) > 5 and RESERVOIR):
                        # Synonym lookup returned the word itself: take a
                        # reservoir word instead.
                        RESERVOIR.sort(key=len)
                        similarterm = RESERVOIR[word_idx % len(RESERVOIR)]

                # manually get rid of some terrible choices
                similarterm = synonym_overrides.get(similarterm, similarterm)

                # Words that double as state abbreviations.
                if (word_nopunct.upper() in import_utilities.state_abbrev
                        and lowered not in english_stopwords
                        and "me," not in word):
                    if word_nopunct == "oh":
                        similarterm = random.choice(import_utilities.exclaims)
                    elif RESERVOIR:
                        similarterm = random.choice(RESERVOIR)

                # Hyphenated words: replace each part longer than two
                # characters separately; shorter parts are dropped.
                hyp = word.split("-")
                if len(hyp) > 1:
                    parts = []
                    for part in hyp:
                        if len(part) > 2:
                            # One call only: a second call is not guaranteed
                            # to return the same (or a non-None) value.
                            synonym = import_utilities.synset_creeley(part)
                            parts.append(part if synonym is None else synonym)
                    similarterm = import_utilities.strip_underscore("-".join(parts))

                # Swear words are traded for another swear word.
                if word_nopunct in import_utilities.curses:
                    similarterm = random.choice(import_utilities.curses)

                if similarterm is not None:
                    if len(hyp) > 1:
                        replacement_word = similarterm
                    elif word_nopunct:
                        # Keep the punctuation around the word.  An empty
                        # word_nopunct is skipped: replacing "" would insert
                        # the synonym between every character.
                        replacement_word = word.replace(word_nopunct, similarterm)
                        replacement_word = import_utilities.strip_underscore(replacement_word)
                        replacement_word = import_utilities.replaceNumbers(replacement_word)

                ###########################
                # RESERVOIR_OF_WEIRDNESS  #
                ###########################
                if lowered in import_utilities.impera:
                    replacement_word = random.choice(import_utilities.impera)
                elif lowered in import_utilities.conjuncts:
                    replacement_word = random.choice(import_utilities.conjuncts)
                elif lowered in import_utilities.indef_prono:
                    replacement_word = random.choice(import_utilities.indef_prono)
                elif lowered in import_utilities.prepo:
                    replacement_word = random.choice(import_utilities.prepo)
                elif lowered in import_utilities.rel_prono:
                    # Relative pronouns are left alone.
                    replacement_word = word
                elif lowered[-2:] == "ly":
                    adverb = import_utilities.synset_creeley(word)
                    if adverb is not None:
                        replacement_word = import_utilities.strip_underscore(adverb)
                elif (len(hyp) < 2 and "like" not in word_nopunct
                        and import_utilities.singularize(word_nopunct)
                        == import_utilities.singularize(replacement_word)
                        and lowered not in import_utilities.stopwords_ls):
                    if (word not in RESERVOIR and quit_language < 0
                            and import_utilities.countPunctuation(word) < 1
                            and len(word_nopunct) > 3
                            and not word_nopunct.istitle()):
                        # Only small words are added to the reservoir.
                        if len(word) < 7:
                            RESERVOIR.append(word)
                        replacement_word = random.choice(rap_mouth)

                if quit_language > 1 and not word_nopunct.istitle():
                    # Probably a foreign language: make a word salad in English.
                    replacement_word = random.choice(rap_mouth)

                ###################################################
                # MOST REPLACEMENT occurs here...                 #
                ###################################################
                try:
                    poem_ls = poem_replaced.split(' ')
                    position = poem_ls.index(word)

                    if (len(word) > 3 and RESERVOIR
                            and replacement_word.strip() == word_nopunct.strip()):
                        # Last chance: the word survived every stage unchanged.
                        RESERVOIR.sort(key=len)
                        replacement_word = RESERVOIR[position % len(RESERVOIR)]

                    if "****" not in word and "." != word and "\n" not in word:
                        poem_ls[position] = replacement_word
                    poem_replaced = " ".join(poem_ls)
                except (TypeError, ValueError):
                    # Word no longer present, or the pieces cannot be joined
                    # (ValueError also covers Unicode errors): flag the poem
                    # so it is not written, and carry on with the next word.
                    skip_poem = True
                    continue

            ###########################################################################
            # Pattern.en is used for conjugation and article replacement             #
            ###########################################################################
            poem_replaced = _conjugate_after_contractions(poem_replaced)
            poem_replaced = _fix_articles(poem_replaced)

            if skip_poem:
                cnt = cnt - 1
                continue

            #########################
            #   WRITE SINGLE POEM   #
            #########################
            # Note: the literal "failed" also has a non-zero length, so a poem
            # whose JSON could not be loaded is still written.
            if len(response) > 0:
                HTML_poem = "".join(line + "<br>" for line in poem_replaced.split("\n"))

                ALL_poems = "<br>[ A  generated-poem based upon: <i>"+ title +"</i> by <b>"+ author+"</b>]<br><br><i>"+new_title+"</i><br> by <b>"+ new_author   +"</b><br>"+HTML_poem+ALL_poems.split("</h2>")[1].replace("  ","&nbsp")

                tmp_poem= "[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+new_title+ "\nby "+new_author+"\n"+poem_replaced

                print("\n~~~\n")
                # SLOW TYPEWRITER PRESENTATION: one character at a time.
                for c in tmp_poem:
                    time.sleep(0.04)
                    sys.stdout.write(c)
                    sys.stdout.flush()

                txt_fn = poem_id.split("_")[1] + "_POEMs.txt"

                WRITE_BIO_PATH = DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                if not os.path.exists(WRITE_BIO_PATH):
                    os.makedirs(WRITE_BIO_PATH)

                _write_text_file(WRITE_BIO_PATH + txt_fn, tmp_poem)

                # Rewrite the whole page after every poem: wasteful, but
                # useful if the run is interrupted.
                ALL_poems = ALL_poems_intro+ALL_poems.replace("  ","&nbsp")
                ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt))
                ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time))

                txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_CREELEYstyle_"+type_of_run+".html"
                # Closing tag was misspelled "</hmtl>".
                _write_text_file(DATA_DIR+"generated/POEMS/"+txt_fn, ALL_poems+"</html>")

Exemple #5

0

Afficher le fichier

def extractFeaturesAndWriteBio(READ_PATH, file_type):
    """Generate a remixed biography for every matching file under READ_PATH.

    This code targets Python 2 (byte strings, ``str.encode``/``decode``
    round-trips); the prints use the single-argument parenthesised form so
    their output is unchanged there.

    For each file found by ``os.walk(READ_PATH)`` whose name contains
    ``file_type`` and does not contain ``'readme'``:

    1. read ``DATA_DIR + READ_TXT_PATH + <key>.txt`` and split it on
       ``"****!****"`` into author, titles and bio;
    2. replace the author's name in the bio with "David Jhave Johnston" /
       "Jhave";
    3. replace book titles found in the bio with ``getNewTitle(...)`` output
       wrapped in ``<i>`` tags;
    4. shift every all-digit parenthesis-delimited number by up to +/-5;
    5. replace entities listed in the matching JSON response with the result
       of ``findSimilarEntityinRandomJSON``;
    6. replace tokens tagged ``JJ`` / ``RB`` by nltk with a random word from
       the ``JJ`` / ``RB`` pools;
    7. append the result to the global ``ALL_bios`` and print it.

    Args:
        READ_PATH: directory tree to walk.
        file_type: substring a file name must contain to be processed.

    Side effects:
        Appends each file id to the module-level ``filenames`` list, rebinds
        the globals ``bio`` and ``ALL_bios``, and prints progress to stdout.

    Raises:
        IndexError: if a file id has no ``_`` separated second part, or the
            text file has fewer than three ``"****!****"`` sections.
    """
    global ALL_bios, bio

    for subdir, dirs, files in os.walk(READ_PATH):
        for fname in files:

            if file_type not in fname or 'readme' in fname:
                continue

            # ID
            file_id = fname.split(".")[0]
            print("\n\n*********\nID: " + file_id)

            filenames.append(file_id)

            # The second "_" separated part of the id keys both the text file
            # and the JSON file.
            source_key = file_id.split("_")[1]

            ##########################
            # Load BIO TEXT FILE     #
            ##########################
            txt_fn_path = DATA_DIR + READ_TXT_PATH + source_key + ".txt"

            # Context manager so the handle is closed even if read() fails.
            with open(txt_fn_path) as f_txt:
                txt_data = f_txt.read()

            sections = txt_data.split("****!****")
            author = sections[0].strip(' \t\n\r')
            titles = sections[1].strip(' \t\n\r').split(" !~*~! ")
            titles[-1] = titles[-1].strip(" !~*~!")
            bio = sections[2].strip(' \t\n\r')

            ###############################
            # REPLACE AUTHOR NAME
            ###############################
            author_parts = author.split(" ")
            author_ln = author_parts[-1]
            author_fn = author_parts[:-1]

            replaced_words = []
            for word in bio.split(" "):
                # Guard against an empty author: "" is "in" every string and
                # str.replace("", x) would insert x between every character.
                if author and author in word:
                    word = word.replace(author, "David Jhave Johnston")
                if author_ln and author_ln in word:
                    word = word.replace(author_ln, "Jhave")
                replaced_words.append(word)
            # Every word is followed by a single space, including the last.
            bio_replaced = " ".join(replaced_words) + " "

            # "<first names> Jhave" is what the per-word pass leaves behind
            # for a multi-word author; collapse it to plain "Jhave". Skipped
            # for single-word authors, where the pattern would be " Jhave"
            # and would glue "Jhave" onto the preceding word.
            if author_fn:
                wrong_name = " ".join(author_fn) + " Jhave"
                bio_replaced = bio_replaced.replace(wrong_name, "Jhave")

            #######################
            # replace BOOK TITLES
            #######################
            # Drop titles that are mostly whitespace. Built as a new list:
            # removing from a list while iterating it skips the element that
            # follows each removal.
            titles = [
                t for t in titles
                if not (t.count(" ") > len(t) // 3
                        or (len(t) < 5 and t.count(" ") > 3))
            ]
            if len(titles) > 1:
                for t in titles:
                    # len(t) > 3 also rules out ".", " " and "  ".
                    if len(t) > 3 and t in bio_replaced:
                        nt = getNewTitle(t).encode('utf-8')
                        bio_replaced = bio_replaced.replace(
                            t, "<i> " + nt + " </i>")

            ############################
            # replace years with another
            ############################
            # Splitting on "(" then ")" isolates parenthesised chunks; any
            # chunk that is all digits is shifted by a random offset in
            # [-5, 5]. Note the replace() affects every occurrence of that
            # digit string in the bio, not only the parenthesised one.
            for open_chunk in bio_replaced.split("("):
                for chunk in open_chunk.split(")"):
                    if chunk.isdigit():
                        new_num = random.randint(
                            int(chunk) - 5, int(chunk) + 5)
                        bio_replaced = bio_replaced.replace(
                            chunk, str(new_num))

            #################
            # Load JSON     #
            #################
            response = loadJSONfile(READ_JSON_PATH + "poetryFoundation_" +
                                    source_key + "_Alchemy_JSON.txt")

            if response != "failed" and response.get('entities') is not None:
                # Loop-invariant: same character-cleaning pattern for every
                # entity.
                chars_pattern = (
                    '(' + '|'.join(import_utilities.chars.keys()) + ')')

                for entity in response['entities']:
                    ce = entity['text'].replace("0xc2", " ")
                    ce = ce.replace("0xe2", "'")
                    ce = re.sub(chars_pattern,
                                import_utilities.replace_chars, ce)
                    ce = ce.encode('utf-8')

                    try:
                        content = ce.decode('utf-8').encode(
                            'ascii', 'xmlcharrefreplace')
                    except UnicodeDecodeError:
                        # Skip this entity: without a fresh `content` the
                        # code below would use an undefined or stale value.
                        print("AAAARGGGGHHH!!!!")
                        continue

                    # Empty content would match everywhere; ignore it.
                    if content and content in bio_replaced:
                        ################################################
                        # Replace similar entities from other JSON     #
                        ################################################
                        replacement_entity = findSimilarEntityinRandomJSON(
                            content, entity['type'])
                        bio_replaced = bio_replaced.replace(
                            content, replacement_entity)

            ##########################
            #   POS REPLACMENT       #
            ##########################
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(bio_replaced))

            #############################
            #  ADJECTIVES and ADVERBS   #
            #############################
            for token, tag in tagged_tokens:
                # Leave closing </i> markup and very short tokens alone.
                if "/i" in token or len(token) <= 3:
                    continue

                if tag == 'JJ':
                    pool = JJ
                elif tag == 'RB':
                    pool = RB
                else:
                    continue

                target = import_utilities.strip_punctuation(token)
                if not target:
                    # An empty target would turn the pattern into r'\b\b',
                    # which matches at the first word boundary in the bio.
                    continue

                substitute = import_utilities.moveBeginAndEndPunctuationFromStrToString(
                    token, random.choice(pool).lstrip())
                if token.istitle():
                    substitute = substitute.title()

                # re.escape keeps regex metacharacters in the token literal;
                # the callable keeps backslashes in the substitute literal.
                # count=1: only the first occurrence is swapped.
                bio_replaced = re.sub(
                    r'\b' + re.escape(target) + r'\b',
                    lambda match, text=substitute: text,
                    bio_replaced, 1)

            # The "failed" sentinel is a non-empty string, so a failed JSON
            # load still emits the bio (without entity replacement); only an
            # empty response takes the else branch.
            if len(response) > 0:
                ALL_bios += "<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A bio generated from : " + author + " ]<br><br>" + bio_replaced

                print("<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A bio generated from : " + author + " ]<br><br>")
                print(bio_replaced)
            else:
                print("~~~~~~~~~~~~~~~~!!!!!!!!!! EMPTY response: " + author)

Exemple #6

0

Afficher le fichier

Fichier : ELO2015_PERF_Creeley-Aug4th.py Projet : amshenoy/Big-Data-Poetry

def extractFeaturesAndWritePoem(READ_PATH,file_type):
    
    

    global ALL_poems,bio,cnt,SMALL_POEM,SMALL_POEM_ALL

    inp=0
    sub_cnt=0
    words_total=0
    lines_total=0

    pause_every = 0

    for subdir, dirs, files in os.walk(READ_PATH):

        #print "randomizing",datetime.datetime.now()
        random.seed(datetime.datetime.now())
        random.shuffle(files)

        for file in files:


            
            num_of_files = len(files)-1 # deduct the DS_store
            #print (num_of_files,'readDirectory',READ_PATH)
            
            if file_type in file  and 'readme' not in file:

                JSON_alchemy_loaded = False

                # ID
                id=file.split(".")[0]
                #print "\nID:",id.split("_")[1]

                filenames.append(id)
                cnt+=1

                # print('')
                # print('')
                # print('OPENED:',id)
                # print('')
                # print('')

                ##############
                #  HOW MANY? #
                ##############
                sub_cnt+=1
                if sub_cnt>=int(inp):
                    if int(inp) != 0:
                        end_time = time.time()
                        es = end_time-start_time
                        print "\n",sub_cnt, "poems,\n",lines_total,"lines,\n",words_total,"words \ngenerated in\n",("%.2f" % es),"seconds"
                        
                    words_total=0
                    lines_total=0

                    # RESTART

                    sub_cnt=0
                    inp = raw_input("\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ")

                    if not inp:
                        print "You entered nothing! 10 poems will be generated."
                        inp=10
                        
                    pause_every = raw_input("\nPause every 1 or 2 or ... poems?")
                    if not pause_every:
                        print "You entered nothing! Pause will occur every 10 poems."
                        pause_every=10

                    sleep_time = raw_input("\nPause for how many seconds?")
                    if not sleep_time:
                        print "You entered no time! 10 second wait assigned."
                        sleep_time=10

                    print "\n\n^^^^^^^^^^^^^^^"
                    start_time = time.time()

                print 'Poem #',sub_cnt

                poem_replaced = ""
                replacement_word = ""
                previous_replacement_word = ""
                
                author=""
                titles=""
                title=""
                new_title=""

                replaced_ls =[]
                new_titles_ls = []
                quit_language=0
                oscillator=0

                word_cnt=0

                # if EXCEPTION is raised... do not add to html
                SKIP_bool=False

                ##########################
                # Load  POEM TEXT FILE   #
                ##########################

                ##
                # PAUSE
                ##
                #time.sleep(5)

                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt"
                #print "txt_fn_path:",txt_fn_path

                if os.path.isfile(txt_fn_path) and cnt>0:
                    txt_data=open(txt_fn_path).read()

                    # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                    # txt_data.decode('ISO-8859-2') .decode('utf-8')
                    # unicode(txt_data)


                    author=txt_data.split("****!****")[0].strip(' \t\n\r')
                    
                    title=txt_data.split("****!****")[1].strip(' \t\n\r')
                    
                    bio=txt_data.split("****!****")[2]#.strip(' \t\n\r')

                    ######  CLEAN BIO
                    bio.replace("\t","&#9;")
                    bio.replace("\n"," <br>")
                    bio.replace("\r"," <br>")
                    bio.replace("","~~~~!~~~")
                    poem_replaced=bio
                    #print poem_replaced

                    ###############################
                    # REPLACE AUTHOR NAME in poem #
                    ###############################
                    author_ln=author.split(" ")[-1].lstrip()
                    author_fn=author.split(" ")[:-1]
                    author = " ".join(n for n in author_fn)+author_ln
                    #
                    #poem_replaced = poem_replaced.replace(author_ln,"Jhave")

                    #######################
                    # replace BOOK TITLES #
                    #######################
                    #print "TITLES"]
                    new_title = getNewTitle("title").encode('utf-8')

                    #######################
                    # fake AUTHOR         #
                    #######################
                    
                    new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2]
                    #print "new AUTHOR",new_author                           

                    ############################
                    # replace years with another
                    ############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(int(w2)-5,int(w2)+5)
                                #print "REPLACING #:",w2,new_num
                                poem_replaced = poem_replaced.replace(w2,str(new_num))
                                replaced_ls.append(new_num)                            
                                               

                    #################
                    # Load JSON     #
                    #################
                    response = loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt")

                    if response != "failed":

                        JSON_alchemy_loaded = True

                        if response.get('entities') is not None:
                            for idx,entity in enumerate(response['entities']):

                                #DATA clean the original words (redundant duplicate but for some reason it works... and is necessary... a kludge of crowbars and bleach)
                                ce = entity['text'].replace("0xc2"," ")
                                ce = ce.replace("0xe2","'")
                                ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')

                                try:
                                    content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    "AAAARGGGGHHH!!!!"

                                if content in poem_replaced:
                                                       
                                    #################################################
                                    #                                               #
                                    # Replace similar entities from other JSON      #
                                    # Using data from ALCHEMY API                   #
                                    #                                               #
                                    #################################################
                                    replacement_entity = findSimilarEntityinRandomJSON(content,entity['type'])

                                    cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity)

                                    poem_replaced = poem_replaced.replace(content,replacement_entity)

                                    replaced_ls.append(replacement_entity)
                    

                    ##########################
                    #   POS REPLACMENT       #
                    ##########################

                    token_tuples = nltk.word_tokenize(poem_replaced)
                    tt = nltk.pos_tag(token_tuples)

                    #################
                    #  ADJECTIVES   #
                    #################
                    for i in tt:
                        if "/i" not in i[0] and len(i[0])>3 and i[0] != "died":
                            origw =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0])
                            origw =import_utilities.strip_punctuation(origw) 
                            if i[1]=='JJ' :
                                JJr = random.choice(JJ)
                                # # JJr =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr)
                                # JJr = import_utilities.strip_punctuation(JJr)
                                JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.lstrip().lstrip())
                                
                                if i[0].istitle():
                                    JJr = JJr.title()

                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1)
                                replaced_ls.append(JJr)
                            if i[1]=='RB':
                                RBr = random.choice(RB)
                                RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.lstrip().lstrip())

                                if i[0].istitle():
                                    RBr = RBr.title()
                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0])  + r'\b', RBr, poem_replaced,1)
                                replaced_ls.append(RBr)


                    ########################
                    # IS IT ENGLISH?       #
                    ########################
                    for line  in poem_replaced.split('\n\r'):
                        if len(line)>0 :
                            if "english" not in import_utilities.get_language(line):
                                quit_language+=1
                                #print "NOT english:",quit_language,line
                            else:
                                quit_language-=1

                    
                    #########################
                    #   SYNSET REPLACE      #
                    #########################
                    for idx,word in enumerate(poem_replaced.split(' ')):

                        similarterm=""

                        if "<br>" not in word and "&#9;" not in word and len(word)>0:


                            words_total+=1


                            #########################
                            #   PRONOUN ' VERB      #
                            #########################
                            if len(word.split("'"))>1:
                                if word.split("'")[0] in personal_pronouns:
                                    replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' '
                                poem_replaced.replace(word,replacement_word)             
                                #print "word,",word,"replacement_word:",replacement_word
                           
                            ####################################################
                            # Replacement of OTHERs                            #
                            ####################################################

                            elif not word.lower().strip(" \n\t\r") in stopwords.words('english'):

                                # take off leading brackets, commas etc...
                                word_punct_nopunct = import_utilities.strip_punctuation_bool(word)
                                word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r")
                                word_punct = word_punct_nopunct['punct']
                                punct_bool = word_punct_nopunct['punct_bool']

                             

                                #######################################################
                                # MAIN EXCHANGE PROCESS CALL >>>>>>>   GET THE SYNSET #
                                #######################################################    
                                if word_nopunct[-4:].lower()=="here":
                                    similarterm=random.choice(import_utilities.heres)
                                else:
                                    #print "WORD:",word_nopunct
                                    if len(word_nopunct)>3:

                                        oscillator  = oscillator+1
                                        
                                        ############################################
                                        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                        # STYLE SWITCH..... should in future use POS
                                        # ... i.e. if noun & oscillator%3, do...
                                        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                        ############################################
                                        # synset
                                        similarterm = import_utilities.synset_creeley(word_nopunct)
                                        #print "synset", similarterm

                                        if similarterm is not None and similarterm == word_nopunct and len(word_nopunct)>4:
                                            #RESERVOIR.sort(key=len)
                                            poetry_mouth.sort(key=len)
                                            similarterm= poetry_mouth[idx%len(poetry_mouth)]#RESERVOIR[idx%len(RESERVOIR)]
                                            #print "NEW",idx,len(RESERVOIR),similarterm,word_nopunct,"PRE>>>>>>>>LAST CHANGE STOP: ", word, "~",similarterm

                                            

                                #######################################                      
                                # abbreviations for f*****g states!   #
                                #######################################
                                if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word:
                                    tmp = similarterm
                                    if word_nopunct == "oh": 
                                        similarterm = random.choice(import_utilities.exclaims)
                                    else:

                                        similarterm = random.choice(poetry_mouth)#RESERVOIR)
                                    #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line

                                ##############
                                # hyphenated #
                                ##############
                                hyp =word.split("-")
                                #print word,len(hyp)
                                if len(hyp) >1:
                                    similarterm=""
                                    for w in hyp:
                                        if len(w) > 2:
                                            if import_utilities.synset_creeley(w) is not None:
                                                similarterm +=  import_utilities.synset_creeley(w)+"-"
                                            else:
                                                similarterm += w+"-"
                                    similarterm = import_utilities.strip_underscore(similarterm[:-1])
                                    #print "hyphenated:",word,"replaced by: "+similarterm

                                
                                #########################################################    
                                # is it a TRUNCATED VERB slang as in singin or wishin   #
                                #########################################################
                                # if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]:
                                #     similarterm = import_utilities.synset_creeley(word_nopunct+'g')
                                #     ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm
                                #     interim = import_utilities.lemma(similarterm)
                                #     ## #print interim
                                #     similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] 
                                #     # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1]
                                   

                                #################      
                                # SWEAR WORD    #
                                #################
                                ##print "at the garden of if:", word
                                if word_nopunct in import_utilities.curses:
                                    similarterm = random.choice(import_utilities.curses)
                                    ##print "SWEAR WORD word: '"+word+"'",similarterm

                                                          
                                ############################################
                                # manually get rid of some terrible choices
                                ############################################
                                naw_terms=["mind","lonely"]
                                if similarterm == "ilk":
                                    similarterm = "like"

                                if similarterm == "Nox":
                                    similarterm = "oil"

                                if similarterm == "ope":
                                    similarterm = "does"

                                if similarterm == "information technology":
                                    similarterm = "it"

                                if similarterm == "velleity":
                                    similarterm = "want"

                                if similarterm == "Crataegus laevigata":
                                    similarterm = "may"

                                if similarterm == "eff":
                                    similarterm = "know"

                                if similarterm == "naw":
                                    similarterm = "mind"

                                if similarterm == "lento":
                                    similarterm = "slow"

                                #print "SIMILAR:",similarterm

                                if similarterm is not None:
                                    if len(hyp) >1:
                                        replacement_word = similarterm
                                    else:
                                        replacement_word = word.replace(word_nopunct, similarterm)
                                        replacement_word = import_utilities.strip_underscore(replacement_word)
                                        replacement_word = import_utilities.replaceNumbers(replacement_word)
                                else:
                                    replacement_word = random.choice(poetry_mouth)#RESERVOIR)

                                ################################
                                # RESERVOIR_OF_WEIRDNESS       #
                                # create a large pool of words #
                                ################################  

                                if word_nopunct.lower() in import_utilities.impera:
                                    replacement_word=random.choice(import_utilities.impera)
                                    #print word,"IMPERA:",replacement_word
                                elif word_nopunct.lower() in import_utilities.conjuncts:
                                    replacement_word=random.choice(import_utilities.conjuncts)
                                    #print word," CONJUNCTION replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.indef_prono:
                                    replacement_word=random.choice(import_utilities.indef_prono)
                                    #print word," INDEF_prono replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.prepo:
                                    replacement_word=random.choice(import_utilities.prepo)
                                    #print word," prepo replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.rel_prono:
                                    replacement_word=word
                                    #print word," rel_prono LEAVE alone: ",replacement_word
                                elif word_nopunct.lower()[-2:] =="ly":
                                    if import_utilities.synset_creeley(word) is not None:
                                        replacement_word=import_utilities.strip_underscore(import_utilities.synset_creeley(word))#(word[:-2])
                                    #print word," ADVERB: ",replacement_word
                                    # if replacement_word[-2:] !="ly":
                                    #     replacement_word +="ly"
                                                                            
                                else:
                                    if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) ==  import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls:

                                        if word not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): 
                                            
                                            #print "ADDING",word,"to reservoir"
                                            #################################################
                                            # ADDING ONLY SMALL WORDS 
                                            # & MAKING A POEM OUT OF THEM
                                            #################################################
                                            if len(word)<7 and len(word)>0:
                                                small_word = word
                                                if random.randint(0,4)==3:
                                                    small_word +="\n"
                                                #print small_word
                                                small_word +=" "
                                                SMALL_POEM+=small_word

                                                RESERVOIR.append(word)
                                                #SMALL_POEM_ALL.append(small_word)
                                            
                                            replacement_word = random.choice(poetry_mouth)#RESERVOIR)#rap_mouth)# RESERVOIR)
                                            #print word_nopunct,"replaced from reservoir with", replacement_word
                                       # print "'"+word_nopunct+"'  vs RESERVOIR  replacement_word:",replacement_word #,"    new_line:",new_line
                                if quit_language>1 and not word_nopunct.istitle():
                                    #print quit_language, "Probably foreign language: make a word salad in english"
                                    replacement_word = random.choice(poetry_mouth)#RESERVOIR)#science_mouth)#RESERVOIR)
                                    #print word_nopunct,"OTHER replaced from reservoir with", replacement_word
                                
                                ###################################################
                                # MOST REPLACEMENT occurs here...                 #
                                ###################################################
                                poem_ls = poem_replaced.split(' ')
                                idx =  poem_ls.index(word)

                                # print idx,",", poem_ls[idx],",", word ,",",replacement_word
                                #print word ," --- ",previous_replacement_word,replacement_word

                                idx_2 =  poem_ls.index(word)

                                # BUG test: is potential replacement a comma or period or empty?
                                if replacement_word.lstrip().rstrip() =="," or replacement_word.lstrip().rstrip() =="" or replacement_word.lstrip().rstrip() ==".":
                                    #print "found a comma/empty why?",replacement_word.lstrip().rstrip()
                                    replacement_word=random.choice(poetry_mouth)
                                    #print "line633 REPLACING with ",replacement_word

                                if poem_ls[idx]==word and poem_ls[idx]==replacement_word:
                                    #print "SAME idx-2 replacement_word=",replacement_word
                                    # search for same grammatical type the NLTK lists
                                    replacement_word= findSamePOS(replacement_word)
                                    #print "after findSamePOS replacement_word=",replacement_word

                                #print idx,idx_2,"  poem_ls[idx_2]=", poem_ls[idx_2],"  poem_ls[idx]=", poem_ls[idx]," word=", word ,"    replacement=",replacement_word


                                if replacement_word == "~~~~!~~~" or poem_ls[idx]==  "~~~~!~~~": 
                                    print "~~~~!~~~ FOUND ******"

                                else:
                                    if poem_ls[idx]==word:
                                        poem_ls[idx]=replacement_word
                                    if poem_ls[idx_2]==word:
                                        poem_ls[idx_2]=replacement_word
                                    poem_replaced = " ".join(poem_ls)


                                # still the same? try another game
                                if len(word)>5 and replacement_word.lstrip().rstrip() == word_nopunct.lstrip().rstrip():

                                    #################################################
                                    # since word is same as replacement, try alchemy?  
                                    #################################################
                                    
                                    #replacement_entity = findSimilarEntityinRandomJSON(content,entity['type'])

                                    # a last ditch pseudo random select 
                                    # TODO USE THE NLTK LISTS TO SELECT POS WORD
                                    # RESERVOIR.sort(key=len)
                                    # replacement_word = RESERVOIR[idx%len(RESERVOIR)]
                                    #poetry_mouth.sort(key=len)


                                    #INSERTION usi
                                    #replacement_word = random.choice(poetry_mouth)#[idx%len(poetry_mouth)]
                                    replacement_word= findSamePOS(replacement_word)
                                    #print "NEWEST POS",idx,len(poetry_mouth),"LAST CHANGE STOP: ", word, "~",replacement_word

                                # check again
                                if poem_ls[idx]==word and poem_ls[idx]==replacement_word:
                                    #print "AGAIN SAME idx replacement_word=",replacement_word
                                    replacement_word=random.choice(poetry_mouth)
                                    #print "line663 AGAIN NEW rand pf=",replacement_word

                        
                                # REPLACE (but catch for weird chars)
                                try:

                                    if poem_ls[idx]==word and "****" not in word and "." != word and "\n" not in word:

                                        # INSERTION
                                        poem_ls[idx]=replacement_word
                                        #print "line673 REPLACING",poem_ls[idx]," with ",replacement_word


                                    # REASSEMBLE the poem    
                                    poem_replaced = " ".join(poem_ls)

                                    # store this word so that conjugation can be checked 
                                    previous_replacement_word=replacement_word

                                except Exception, e:
                                    #print "PENULTIMATE SKIP_bool replace FAIL",e
                                    SKIP_bool=True
                                    continue

                    ###########################################################################
                    # testing Pattern.en as parser for conjugation and article replacement    #
                    # much more robust than my hand-coded hacks                               #        
                    ###########################################################################
                    
                    # correct CONJUGATION of participle verbs with pattern.en
                    parsed = parse(poem_replaced,tags = True) 
                    pre_verbal = ["'m","'s","'re"]
                    for idx,p in enumerate(parsed.split(" ")):
                        tok =p.split("/")[0]
                        typ=p.split("/")[1]
                        #print idx,tok,typ
                        if tok in pre_verbal:
                            #print "pre_verbal:",tok
                            next_word= parsed.split(" ")[idx+1].split("/")

                            # try try try
                            for ix,n in enumerate(next_word): 
                                next_word[ix] = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, n).encode('utf-8')
                            try:
                                #print  next_word,next_word[0],next_word[1][:2]
                                # if it's a verb that follows
                                if next_word[1][:2] =="VB":
                                    before_verb = " ".join(w for w in poem_replaced.split(" ")[:idx])#.encode('utf-8')
                                    after_verb = " ".join(w for w in poem_replaced.split(" ")[idx+1:])#.encode('utf-8') 
                                    new_verb = conjugate(next_word[0], tense=PARTICIPLE, parse=True).encode('utf-8')
                                    # insert new
                                    #print "CONJUGATION needed, changing:",poem_replaced.split(" ")[idx],"to",parsed.split(" ")[idx],poem_replaced.split(" ")[idx-1]+" "+new_verb
                                    poem_replaced = before_verb+" "+new_verb+" "+after_verb
                            except Exception, e:
                                # print "INside parsed COnjugation loop",e
                                continue


                    # correct ARTICLES
                    for idx,word in enumerate(poem_replaced.split(" ")):
                        if len(word)>0 and idx != 0 and " " not in word:
                            # A or AN
                            if poem_replaced.split(" ")[idx-1].lower() =="a" or poem_replaced.split(" ")[idx-1].lower() =="an":
                                #print word,"---",article(word)+" "+word
                                before_article = " ".join(w for w in poem_replaced.split(" ")[:idx-1])
                                after_article = " ".join(w for w in poem_replaced.split(" ")[idx+1:])
                                new_conj = referenced(word)
                                # capitalize
                                if poem_replaced.split(" ")[idx-1].istitle():
                                    new_conj = new_conj.split(" ")[0].title()+" "+new_conj.split(" ")[1]
                                poem_replaced = before_article+" "+new_conj+" "+after_article

                    #########################
                    #  bug check ,,         #
                    #########################
                    poem_replaced = poem_replaced.replace(",,", ",")
                    poem_replaced = poem_replaced.replace(",.", ",")
                    poem_replaced = poem_replaced.replace(".,", ".")

                    #########################
                    #   WRITE SINGLE POEM   #
                    #########################
                    if not SKIP_bool:

                        tmp_poem=""   

                        # poem_replaced.replace("\t","&#9;")
                        # poem_replaced.replace("\n"," <br>")
                        # poem_replaced.replace("\r"," <br>")

                        HTML_poem=""
                        for line in poem_replaced.split("\n"):
                            #print "LINE", line
                            lines_total+=1
                            HTML_poem += line+"<br>"

                        if len(response) >0 and len(id.split("_"))>1:

                            ALL_poems = "<br>[ A  generated-poem based upon: <i>"+ title +"</i> by <b>"+ author+"</b>]<br><br><i>"+new_title+"</i><br> by <b>"+ new_author   +"</b><br>"+HTML_poem+ALL_poems.split("</h2>")[1].replace("  ","&nbsp")

                            tmp_poem= "\n[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+new_title+ "\nby "+new_author+"\n"+poem_replaced
  
                            #####################
                            #                   #
                            #                   #
                            #     PAUSE IT      #
                            #                   #
                            #                   #
                            #####################

                            if (int(sub_cnt)%int(pause_every) == 0 and int(sub_cnt) !=0):
                                time.sleep(int(sleep_time))

                            #####################
                            #                   #
                            #                   #
                            #       PRINT       #
                            #                   #
                            #                   #
                            #####################

                            print "\n~~~\n"  +tmp_poem+"\n~~~\n" 

                            # SLOW TYPEWRITER PRESENTATION
                            # for line in tmp_poem:
                            #    for c in line:
                            #         time.sleep(0.04)
                            #         sys.stdout.write(c)#(c.encode("utf8"))
                            #         sys.stdout.flush()
# 
                            #sys.stdout.write("\n")

                            txt_fn = id.split("_")[1]+"_POEMs.txt"

                            WRITE__PATH = "../../generated/poetryFoundation/"+poem_style+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                            if not os.path.exists(WRITE__PATH):
                                    os.makedirs(WRITE__PATH)

                            txt_fn_path = WRITE__PATH+txt_fn
                            f_txt=open(txt_fn_path,'w')
                            f_txt.write(tmp_poem)#.encode('utf-8'))       
                            f_txt.close();   
                            #print "\nTXT file created at:",txt_fn_path

                            WRITE__PATH = "../../generated/poetryFoundation/"+poem_style+"_SMALL_POEMS"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                            if not os.path.exists(WRITE__PATH):
                                    os.makedirs(WRITE__PATH)
                            txt_fn_path = WRITE__PATH+txt_fn
                            f_txt=open(txt_fn_path,'w')
                            f_txt.write("[A generated-poem based upon: '"+ title+"' by "+ author +"]\n\n"+SMALL_POEM)#.encode('utf-8'))       
                            f_txt.close(); 
                            SMALL_POEM=""  
                            
                            #######
                            #   write them all.... wasteful... but useful if run is interrupted....
                            ###########  

                            # if cnt==1:
                            #     ALL_poems = ALL_poems_intro+ALL_poems
                            # else:
                            ALL_poems = ALL_poems_intro+ALL_poems.replace("  ","&nbsp")
                            ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                            ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt))
                            ALL_poems = ALL_poems.replace("$$style$$",poem_style)
                            ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time))

                            # ALL POEMS
                            txt_fn = datetime.datetime.now().strftime('%Y-%m-%d')+"_BDP_generated_"+poem_style+"_POEMS_"+str(poem_id)+".html"
                            

                            GEN_PATH = GENERATED_DIR+type_of_run+"_html/"
                            if not os.path.exists(GEN_PATH):
                                    os.makedirs(GEN_PATH)

                            txt_fn_path = GEN_PATH+txt_fn

                            f_txt=open(txt_fn_path,'w')
                            f_txt.write(ALL_poems+"</hmtl>")       
                            f_txt.close();   
                            #print "\nTXT file created at:",txt_fn_path
                        # except Exception, e:
                        #         print "At the final LOOP",e
                        #         #continue
                        #         pass


                        else:
                            pass
                            #print "~! EMPTY response:", author

                    else:
                        cnt = cnt-1