import pandas as pd
import nltk

# combify, createTopic, groupFix, generateSpintax, stopwords, and the
# scraping/scoring helpers used below are defined elsewhere in the project.


def combifyData(DF):
    grouped_df = DF.groupby('Related Terms')
    dataframes = pd.DataFrame(columns=['Term', 'Heading', 'Spintext'])
    for key, item in grouped_df:
        relatedTerm = key
        # Join every heading for this related term into one text blob
        text = ' '.join(heading for heading in item["Heading"])
        combos = combify(text, 1, stop_words=stopwords)
        average = combos["NUMBER_OF_TIMES_FOUND"].mean() + 1    # average count plus one as the cut-off
        df = combos[combos["NUMBER_OF_TIMES_FOUND"] > average]  # keep combos above the cut-off
        df = df.sort_values(by="NUMBER_OF_TIMES_FOUND", ascending=False)  # sort descending
        orderedCombos = tuple(df.index)
        # Build the topic frame for this related term, then generate its spintax
        dfObj = createTopic(grouped_df.get_group(key), relatedTerm, orderedCombos)
        spintaxDF = combifyTopic(dfObj)
        dataframes = pd.concat([dataframes, spintaxDF])
    dataframes.to_csv("Spintax.csv", index=False, encoding='utf-8')
    print(dataframes)
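# combify() comes from elsewhere in the project and isn't shown in this
# section. From the call sites it takes a text blob and an n-gram size, drops
# stop words, and returns a DataFrame indexed by word combination with a
# NUMBER_OF_TIMES_FOUND count column. A minimal stand-in sketch under those
# assumptions (hypothetical: the name combify_sketch and the reading of
# `limit` as a minimum-count filter are guesses, not the project's code):
def combify_sketch(text, n, stop_words=(), limit=0):
    import re
    from collections import Counter
    # Lowercase, tokenise, drop stop words, then count sliding n-grams
    words = [w for w in re.findall(r"[a-z']+", text.lower()) if w not in stop_words]
    grams = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
    counts = pd.DataFrame.from_dict(Counter(grams), orient='index',
                                    columns=['NUMBER_OF_TIMES_FOUND'])
    return counts[counts['NUMBER_OF_TIMES_FOUND'] >= limit] if limit else counts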
def combifyTopic(DF):
    term = str(DF['Related Term'].iloc[0])
    grouped_topic = DF.groupby('Topic')
    fd = pd.DataFrame(columns=['Term', 'Heading', 'Spintext'])
    # Sentence splitter for the paragraphs (requires nltk.download('punkt'))
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for key, item in grouped_topic:
        rows = []  # fresh for every topic
        text, headingText = groupFix(grouped_topic.get_group(key), key)
        # Calculate combos for this topic's paragraphs
        combos = combify(text, 1, stop_words=stopwords)         # returns a DataFrame
        average = combos["NUMBER_OF_TIMES_FOUND"].mean() + 1    # average count plus one as the cut-off
        df = combos[combos["NUMBER_OF_TIMES_FOUND"] > average]  # keep combos above the cut-off
        df = df.sort_values(by="NUMBER_OF_TIMES_FOUND", ascending=False)  # sort descending
        combos = tuple(df.index)
        sentences = sent_detector.tokenize(text.strip())
        # Record every sentence that contains one of the combo words
        for sent in sentences:
            for word in combos:
                if findWholeWord(word)(sent) is not None:
                    rows.append({'Sentence': sent, 'Word': word, 'Heading': headingText})
        dfObj = pd.DataFrame(rows, columns=['Sentence', 'Word', 'Heading'])
        if not dfObj.empty:
            # Collapse to one row per sentence, joining all matched words
            dfObj = dfObj.groupby('Sentence').agg({'Word': ', '.join, 'Heading': 'first'}).reset_index()
            dfObj = dfObj[['Sentence', 'Word', 'Heading']]  # re-arrange columns
            dt = dfObj.sort_values(by="Word", ascending=False)
            dt['Count'] = dt.groupby('Word')['Word'].transform('count')
            dt['Len'] = dt['Sentence'].str.len()  # character count per sentence
            dt = dt.sort_values(by="Count", ascending=False)  # sort descending
            dt = dt[dt["Count"] >= 2]  # keep word groups shared by at least two sentences
            ### Spintax ###
            if not dt.empty:
                response = generateSpintax(dt, combos)
                headingTextforThis = '{' + str(dt['Heading'].iloc[0]) + '}'
                fd = pd.concat(
                    [fd, pd.DataFrame([{'Term': term, 'Heading': headingTextforThis, 'Spintext': response}])],
                    ignore_index=True,
                )
    # Returns the spintax for every topic under one related term
    return fd
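# findWholeWord() is also defined outside this section. Given how it is called
# above, findWholeWord(word)(sent) returning a match or None, it must produce
# a callable that scans a sentence for the word. A minimal sketch consistent
# with that usage is the usual curried regex whole-word matcher (an
# assumption, not necessarily the project's implementation):
def findWholeWord(word):
    import re
    # Compile once per word; the returned .search callable is applied to each
    # sentence and matches only on whole-word boundaries, case-insensitively.
    return re.compile(r'\b{}\b'.format(re.escape(word)), flags=re.IGNORECASE).search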
def all_data(list_of_links):
    # Scrape every link, skipping any that fail
    dataframes = []
    for link in list_of_links:
        try:
            dataframes.append(smash_and_grab(link))
        except Exception:
            print("failed to grab", link)
    df = pd.concat(dataframes, ignore_index=True)
    print("grabbed data")
    df = df[["url", "header", "first line"]]

    # Rank the words that appear across all headers
    headings = " ".join(df["header"])
    most_words = combify(headings, 1, stopwords)
    most_words.sort_values(by="NUMBER_OF_TIMES_FOUND", inplace=True, ascending=False)

    # Pull up to three topic words (in frequency order) and a question word
    # out of each header
    topics = []
    for i in df.index:
        head_to_check = df.at[i, "header"]
        small_dict = {"header": head_to_check}
        word_list = list(most_words.index)
        for slot in ["First topic word", "Second topic word", "Third topic word"]:
            small_dict[slot] = ""
            for word in word_list:
                if word in head_to_check.lower():
                    small_dict[slot] = word
                    word_list.remove(word)
                    break
        small_dict["Q word"] = ""
        for q in ["how", "what", "who", "where", "when"]:
            if q in head_to_check.lower():
                small_dict["Q word"] = q
        topics.append(small_dict)

    topics_info = pd.DataFrame(topics)
    topics_info = topics_info[["header", "First topic word", "Second topic word",
                               "Third topic word", "Q word"]]
    print(topics_info.head(10))
    # topics_info rows line up one-to-one with df rows, so join on the index
    # (pandas does not allow combining on= with left_index/right_index)
    df = df.merge(topics_info.drop(columns="header"), left_index=True, right_index=True)
    print(df.head())

    # Count the word combinations shared across all first lines
    texd = " ".join(words.strip() for words in df["first line"])
    combos = combify(texd, 1, stop_words=stopwords, limit=2)
    combos.to_csv("Combo output for thingy for MTD.csv")
    useful_combos = {c: combos.at[c, "NUMBER_OF_TIMES_FOUND"] for c in combos.index}

    # Keep only rows whose topic words pass the binner and whose first line is useful
    df["BIN"] = "Bin"
    for col in ["First topic word", "Second topic word", "Third topic word"]:
        for i in df.index:
            if topic_binner(df.at[i, col]):
                df.at[i, "BIN"] = "keep"
    df = df[df["BIN"] == "keep"]
    df = df[[useful_para(useful_combos, x) for x in df["first line"]]]
    df["head topics"] = df["First topic word"] + df["Second topic word"] + df["Third topic word"]
    df.drop(columns="BIN", inplace=True)

    como_list = list(useful_combos.keys())
    df["score"] = [combo_counter(como_list, x) for x in df["first line"]]

    # For each head-topic combination, keep its highest-scoring heading/paragraph pair
    topic_scores = {}
    for i in df.index:
        head_t = df.at[i, "head topics"]
        score = df.at[i, "score"]
        if head_t not in topic_scores or score > topic_scores[head_t]["score"]:
            topic_scores[head_t] = {
                "topic": head_t,
                "score": score,
                "heading": df.at[i, "header"],
                "paragraph": df.at[i, "first line"],
            }

    article = pd.DataFrame.from_dict(topic_scores, orient="index")
    article.replace(r"\d+", "", inplace=True, regex=True)  # strip digits
    article.replace("£", "", inplace=True, regex=True)     # strip pound signs
    article.sort_values(by="score", inplace=True, ascending=False)
    article = article.head(30)
    article = article.reset_index()
    article.to_csv("Output for thingy for MTD.csv", index=False)

    # Number each paragraph with a ||n|| marker so the spun text can be matched
    # back up afterwards, and protect the markers themselves from rewriting
    text = ""
    count = 1
    for each in article["paragraph"]:
        addition = "||" + str(count) + "|| "
        text += addition
        como_list.append(addition)
        count += 1
        text += each + "\n"

    with open("shitballs.txt", "w+") as shit_balls:
        shit_balls.write(text)

    response = rewriter.api._transform_plain_text('unique_variation', text, como_list, 'high')
    with open("shittier balls.txt", "w+") as shittier_balls:
        shittier_balls.write(response["response"])

    # Walk the spun text and reattach each chunk to its original row via the ||n|| markers
    post_text = response["response"]
    stored_number = 0
    fin_article = {}
    for line in post_text.split("||"):
        if len(line.strip()) > 0:
            if number_check(line.strip()):
                stored_number = line
            else:
                fin_article[int(stored_number) - 1] = line
                print(stored_number, line)

    post_data = pd.DataFrame.from_dict(fin_article, orient='index', columns=["spun text"])
    output = article.merge(post_data, left_index=True, right_index=True)
    output.to_csv("FINISHeD THING.csv", index=False)