def parserNegativeVerb():
    """Filter the module-level `sentences` list down to sentences that
    contain a negative verb, dump them to disk, then parse the first 30
    and write one HTML dependency visualization per sentence.

    Reads:  data/negative_verbs_dump.txt (one verb per line)
    Writes: data/negative_verbs_sentences_dump.txt
            data/visualizations/negative_verbs/<i>.html
    """
    with open("data/negative_verbs_dump.txt", "r") as f:
        words_to_check = [x.strip() for x in f.readlines()]

    # NOTE(review): `sentences` here must be a module-level list populated
    # elsewhere (presumably by fetchSentences()) — confirm against callers.
    sentences_containing_words = []
    for sentence in sentences:
        lowered = sentence.lower()  # lowercase once, not once per word
        for word in words_to_check:
            if word in lowered:
                sentences_containing_words.append(sentence)
                break  # one match is enough; avoid duplicate entries

    with open('data/negative_verbs_sentences_dump.txt', 'w+') as f:
        f.writelines("%s\n" % line for line in sentences_containing_words)

    # BUG FIX: the original re-read the dump into a local named `sentences`,
    # which made `sentences` local to the whole function and turned the
    # filter loop above into an UnboundLocalError. Use a distinct name.
    with open('data/negative_verbs_sentences_dump.txt', 'r') as f:
        dumped_sentences = [x.strip() for x in f.readlines()]

    from finnish_toolkit import parser
    mp = parser.Parser()
    # Slice instead of range(30): no IndexError when fewer than 30 exist.
    parsed_sentences = [mp.parse(s) for s in dumped_sentences[:30]]

    import os
    dump_folder = './data/visualizations/negative_verbs'
    if not os.path.exists(dump_folder):
        os.makedirs(dump_folder)
    for i, parsed_sentence in enumerate(parsed_sentences):
        parser.visualize(
            parsed_sentence, '{}/{}.html'.format(dump_folder, i))
def parse_most_frequent_named_entity_sentences():
    """Parse the first 30 dumped five-diseases sentences and write one HTML
    dependency visualization per sentence.

    Reads:  data/five_diseases_sentences_dump.txt
    Writes: data/visualizations/five_diseases/<i>.html for each parsed sentence
    """
    with open('data/five_diseases_sentences_dump.txt', 'r') as f:
        sentences = [x.strip() for x in f.readlines()]

    from finnish_toolkit import parser
    mp = parser.Parser()
    # ROBUSTNESS FIX: the original indexed sentences[i] for i in range(30),
    # which raises IndexError when the dump holds fewer than 30 sentences.
    # A slice caps at the list length with identical behavior otherwise.
    parsed_sentences = [mp.parse(sentence) for sentence in sentences[:30]]

    import os
    dump_folder = './data/visualizations/five_diseases'
    if not os.path.exists(dump_folder):
        os.makedirs(dump_folder)
    for i, parsed_sentence in enumerate(parsed_sentences):
        parser.visualize(
            parsed_sentence, '{}/{}.html'.format(dump_folder, i))
def getTopicVariation():
    """Return the overall topic of each lemmatized per-time-slice file.

    Iterates ./post_process/time_lemmatized/*.txt and runs overallTopic()
    (defined elsewhere in this project) on each file's full text.

    Returns:
        list: one topic result per .txt file, in listdir() order.
    """
    topics = []
    for fileN in listdir("./post_process/time_lemmatized"):
        if not fileN.endswith(".txt"):
            continue  # skip non-text entries
        filename = "./post_process/time_lemmatized/{}".format(fileN)
        with open(filename) as f:
            print(filename)
            sentencespre = f.read()
        # redundant f.close() removed — `with` already closes the file
        topic = overallTopic(sentencespre)
        print(topic)
        topics.append(topic)
    return topics


def getSentVariation():
    """Return the overall sentiment of each per-time-slice file.

    Recovered from code that sat unreachably after `return topics` inside
    getTopicVariation(); it matches the commented-out getSentVariation()
    call below. Iterates ./post_process/time/*.txt and runs
    overallSentiment() (defined elsewhere) on each file's full text.

    Returns:
        list: one sentiment result per .txt file, in listdir() order.
    """
    sentiment = []
    for fileN in listdir("./post_process/time"):
        if not fileN.endswith(".txt"):
            continue  # skip non-text entries
        filename = "./post_process/time/{}".format(fileN)
        with open(filename) as f:
            print(filename)
            sentencespre = f.read()
        sentiment_item = overallSentiment(sentencespre)
        print(sentiment_item)
        sentiment.append(sentiment_item)
    return sentiment


# NOTE(review): the original also contained a byte-for-byte duplicate of
# parse_most_frequent_named_entity_sentences()'s body after the second
# `return` — unreachable dead code, removed here. Call the function instead.

# sent = overallSentiment(sentencespre)
# overall_sent = [overallSentiment(sentence) for sentence in sentences]
# sentences=fetchSentences()
# histogramNER()
# commoncooccurred = mostCooccuring()
# print(overallTopic(". ".join(sent) for sent in sentences))
# analyzeFiles()
# translationFunc()
# getTopicVariation()
# getSentVariation()
# translateDivided()
# extract_sentences_of_most_frequent_named_entities()
# parse_most_frequent_named_entity_sentences()