def add_polarity_to_synsets(id_words, _state_queue=None, _id_process=None):
    """
    Adds the positive/negative/objective polarities, read from the SentiWordNet corpus, to the synsets
    linked to the specified words (as returned by
    ``synset_load.get_id_synsets_for_id_words(id_words)``) and stores them in the table Synset.\n
    Synsets for which SentiWordNet has no entry (``swn.senti_synset`` returns None) are left untouched.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param id_words: Ids of the words whose synsets must be updated
    :param _state_queue: Forwarded to the state and database helpers. If None, an empty line is
        printed once all the synsets have been processed.
    :param _id_process: Process identifier forwarded to the state and database helpers
    """
    from nltk.corpus import sentiwordnet as swn
    from loacore.load import synset_load
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    try:
        c = conn.cursor()

        synsets = synset_load.load_synsets(
            id_synsets=synset_load.get_id_synsets_for_id_words(id_words))
        total_synset = len(synsets)
        for synset_count, synset in enumerate(synsets, start=1):
            # Print state
            _commit_polarity_state(_state_queue, _id_process, synset_count, total_synset)

            # Single lookup : senti_synset() returns None when SentiWordNet has no
            # entry for this synset, so the result must be checked before use.
            senti_synset = swn.senti_synset(synset.synset_name)
            if senti_synset is None:
                continue

            synset.pos_score = senti_synset.pos_score()
            synset.neg_score = senti_synset.neg_score()
            synset.obj_score = 1 - (synset.pos_score + synset.neg_score)

            # Values are bound as parameters rather than concatenated into the request.
            safe_execute(
                c,
                "UPDATE Synset SET Pos_Score = ?, Neg_Score = ?, Obj_Score = ? "
                "WHERE Id_Synset = ?",
                0, _state_queue, _id_process,
                mark_args=(synset.pos_score, synset.neg_score, synset.obj_score,
                           synset.id_synset))

        if _state_queue is None:
            print("")
        safe_commit(conn, 0, _state_queue, _id_process)
    finally:
        # Release the connection even if a lookup or a request fails.
        conn.close()
def add_synsets_to_sentences(sentences, print_synsets=False, _state_queue=None, _id_process=None,
                             freeling_modules=None):
    """
    Performs a Freeling process to disambiguate words of the sentences according to their context
    (UKB algorithm) linking them to a unique synset (if possible).\n
    Our sentences are converted to Freeling Sentences before processing.\n
    Notice that even if we may have already computed the Lemmas for example, Freeling Sentences generated from our
    sentences are "raw sentences", without any analysis linked to their Words. So we make all the Freeling process
    from scratch every time, except *tokenization* and *sentence splitting*, to avoid any confusion.\n
    The first sense returned by Freeling for each word is kept, wrapped into a Synset, inserted into the table
    Synset and referenced from the table Word.

    .. note:: This function should be used only inside the file_process.add_files() function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_synsets: If True, print disambiguation results
    :type print_synsets: boolean
    :param _state_queue: If not None, a ProcessState is put on it while Freeling is loading. Also forwarded
        to the state and database helpers.
    :param _id_process: Process identifier forwarded to the state and database helpers
    :param freeling_modules: Already loaded ``(morfo, tagger, sen, wsd)`` modules. If None, they are
        obtained from ``init_freeling()``.
    """
    from loacore.conf import DB_TIMEOUT
    from loacore.utils.db import safe_commit, safe_execute

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tagger, sen, wsd = init_freeling()
    else:
        morfo, tagger, sen, wsd = freeling_modules

    _disambiguation_state(_state_queue, _id_process)

    # perform morphosyntactic analysis and disambiguation
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)
    # annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Copy freeling results into our Words
    for s, sentence in enumerate(sentences):
        processed_sentence = processed_sentences[s]
        if len(sentence.words) != len(processed_sentence):
            print("/!\\ Warning, sentence offset error in synset_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentence])

        for w, word in enumerate(sentence.words):
            rank = processed_sentence[w].get_senses()
            if len(rank) == 0:
                # No sense found for this word
                continue

            # Only the first entry of the senses list is kept
            synset_code = rank[0][0]
            if synset_code[0] == '8':
                # ignore synsets offsets 8.......-.
                # they are odd synsets that WordNet can't find...
                continue

            # Resolve the WordNet name once, for both the Synset and the printing
            synset_name = wn.of2ss(synset_code).name()
            word.synset = Synset(None, word.id_word, synset_code, synset_name, None, None, None)
            if print_synsets:
                print("Word : " + word.word)
                print("Synset code : " + synset_code)
                print("Synset name : " + synset_name)

    # Add synsets to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    try:
        c = conn.cursor()

        total_sentence = len(sentences)
        for sentence_count, sentence in enumerate(sentences, start=1):
            # Print state
            _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

            for word in sentence.words:
                synset = word.synset
                if synset is None:
                    continue

                # Add synset
                safe_execute(
                    c,
                    "INSERT INTO Synset (ID_Word, Synset_Code, Synset_Name) "
                    "VALUES (?, ?, ?)",
                    0, _state_queue, _id_process,
                    mark_args=(word.id_word, synset.synset_code, synset.synset_name))

                # Get back id of last inserted synset
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_synset = c.fetchone()[0]

                # Update Word table (values bound as parameters, like the INSERT above)
                safe_execute(
                    c,
                    "UPDATE Word SET ID_Synset = ? WHERE ID_Word = ?",
                    0, _state_queue, _id_process,
                    mark_args=(id_synset, word.id_word))

        safe_commit(conn, 0, _state_queue, _id_process)
    finally:
        # Release the connection even if a request fails.
        conn.close()
def add_sentences_from_reviews(reviews, _state_queue=None, _id_process=None, freeling_modules=None):
    """
    Performs the first Freeling process applied to each normalized review.\n
    Each review is tokenized, and then splitted into sentences, thanks to corresponding Freeling modules.\n
    A representation of the Sentences and their Words (tokens) are then added to corresponding tables.\n
    Freeling sentences whose length is greater than 50 are ignored.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param reviews: Reviews to process
    :type reviews: :obj:`list` of |Review|
    :param _state_queue: If not None, a ProcessState is put on it while Freeling is loading. Also forwarded
        to the state and database helpers. If None, an empty line is printed once all the sentences are added.
    :param _id_process: Process identifier forwarded to the state and database helpers
    :param freeling_modules: Already loaded ``(morfo, tk, sp)`` modules. If None, they are obtained from
        ``init_freeling()``.
    :return: added sentences
    :rtype: :obj:`list` of |Sentence|
    """
    from loacore.classes.classes import Word
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tk, sp = init_freeling()
    else:
        morfo, tk, sp = freeling_modules

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    try:
        c = conn.cursor()

        added_sentences = []
        try:
            total_review = len(reviews)
        except TypeError:
            # Review is a ReviewIterator, unknown length.
            total_review = " - "

        # NOTE(review): `reviews` is iterated three times in this function. A
        # single-pass iterator would yield nothing on the later passes — confirm
        # that ReviewIterator can be iterated again.
        for review_count, review in enumerate(reviews, start=1):
            # Print state
            _tokenization_state(_state_queue, _id_process, review_count, total_review)

            tokens = tk.tokenize(review.review)
            sentences = sp.split(tokens)
            sentences = morfo.analyze(sentences)

            review_index = 0
            for sentence in sentences:
                if len(sentence) > 50:
                    # Too long sentences are skipped and do not consume a review index
                    continue

                review_sentence = Sentence(None, review.id_review, review_index, None)
                review_index += 1

                # Add words
                for sentence_index, word in enumerate(sentence):
                    review_sentence.words.append(
                        Word(None, None, sentence_index, word.get_form(), None, None, None))

                review.sentences.append(review_sentence)

        # Count the sentences without building an intermediate list
        total_sentence = sum(len(r.sentences) for r in reviews)
        sentence_count = 0
        for r in reviews:
            for s in r.sentences:
                # Print state
                sentence_count += 1
                _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

                # Add sentence
                safe_execute(c,
                             "INSERT INTO Sentence (ID_Review, Review_Index) "
                             "VALUES (?, ?)",
                             0, _state_queue, _id_process,
                             mark_args=(s.id_review, s.review_index))

                # Get back id of last inserted sentence
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_sentence = c.fetchone()[0]
                s.id_sentence = id_sentence

                # Link each word to its sentence and insert them all in a single request
                sql_words = []
                for w in s.words:
                    w.id_sentence = id_sentence
                    sql_words.append((id_sentence, w.sentence_index, w.word))
                safe_execute(
                    c,
                    "INSERT INTO Word (ID_Sentence, Sentence_Index, word) VALUES (?, ?, ?)",
                    0, _state_queue, _id_process,
                    mark_args=sql_words, execute_many=True)

                added_sentences.append(s)

        if _state_queue is None:
            print("")
        safe_commit(conn, 0, _state_queue, _id_process)
    finally:
        # Release the connection even if the Freeling process or a request fails.
        conn.close()

    return added_sentences
def add_dep_tree_from_sentences(sentences, print_result=False, _state_queue=None, _id_process=None,
                                freeling_modules=None):
    """
    Generates the dependency trees of the specified sentences and add the results to the database.\n
    Sentences are firstly converted into "raw" Freeling sentences (without any analysis) and then all the
    necessary Freeling processes are performed.\n
    The PoS_tag of words are also computed and added to the database in this function.\n

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    .. note:: This process can be quite long. (at least a few minutes)

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_result: Print PoS_tags and labels associated to each |Word|
    :type print_result: boolean
    :param _state_queue: If not None, a ProcessState is put on it while Freeling is loading. Also forwarded
        to the state and database helpers. If None, an empty line is printed once all the trees are added.
    :param _id_process: Process identifier forwarded to the state and database helpers
    :param freeling_modules: Already loaded ``(morfo, tagger, sen, wsd, parser)`` modules. If None, they are
        obtained from ``init_freeling()``.
    """
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tagger, sen, wsd, parser = init_freeling()
    else:
        morfo, tagger, sen, wsd, parser = freeling_modules

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    # Print state
    _parsing_state(_state_queue, "DT Tagging...", _id_process)

    # perform morphosyntactic analysis
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "DT Disambiguation...", _id_process)

    # annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "Dep Tree Parsing...", _id_process)

    # Dependency tree parsing
    processed_sentences = parser.analyze(processed_sentences)

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    try:
        c = conn.cursor()

        total_sentence = len(sentences)
        for s, sentence in enumerate(sentences):
            # Print State
            _commit_state(_state_queue, _id_process, s + 1, total_sentence)

            processed_sentence = processed_sentences[s]

            # Add dep_tree to database
            dt = processed_sentence.get_dep_tree()
            dep_tree = DepTree(None, None, sentence.id_sentence)

            safe_execute(c, "INSERT INTO Dep_Tree (ID_Sentence) VALUES (?)",
                         0, _state_queue, _id_process,
                         mark_args=[dep_tree.id_sentence])

            # Get back id_dep_tree
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
            id_dep_tree = c.fetchone()[0]
            dep_tree.id_dep_tree = id_dep_tree

            # Database process
            root = None
            if len(sentence.words) != len(processed_sentence):
                print("/!\\ Warning, sentence offset error in deptree_process /!\\")
                print(sentence.sentence_str())
                print([w.get_form() for w in processed_sentence])

            for w, word in enumerate(sentence.words):
                freeling_word = processed_sentence[w]
                rank = freeling_word.get_senses()
                if len(rank) > 0:
                    # The PoS_tag is only copied for words that have at least one sense
                    word.PoS_tag = freeling_word.get_tag()
                    if print_result:
                        print("Word : " + word.word)
                        print("PoS_tag : " + freeling_word.get_tag())
                        print("Label : " + dt.get_node_by_pos(w).get_label())

                # We use the get_node_by_pos function to map the tree to our sentence
                node = dt.get_node_by_pos(w)

                dep_tree_node = DepTreeNode(None, id_dep_tree, word.id_word, node.get_label(), 0)
                if node == dt.begin():
                    dep_tree_node.root = 1
                    root = dep_tree_node

                # Add DepTreeNode to database
                safe_execute(
                    c,
                    "INSERT INTO Dep_Tree_Node (ID_Dep_Tree, ID_Word, Label, root) "
                    "VALUES (?, ?, ?, ?)",
                    0, _state_queue, _id_process,
                    mark_args=(dep_tree_node.id_dep_tree, dep_tree_node.id_word,
                               dep_tree_node.label, dep_tree_node.root))

                # Get back id_dep_tree_node
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_dep_tree_node = c.fetchone()[0]
                dep_tree_node.id_dep_tree_node = id_dep_tree_node

                # Use the freeling set_node_id function to store our db node id in the freeling node
                node.set_node_id(str(id_dep_tree_node))

                # Add PoS_tag to Word
                # (the tag is bound as a parameter instead of being quoted by hand)
                if word.PoS_tag is not None:
                    safe_execute(
                        c,
                        "UPDATE Word SET PoS_tag = ? WHERE ID_Word = ?",
                        0, _state_queue, _id_process,
                        mark_args=(word.PoS_tag, word.id_word))

            # Add dep_tree root to database
            # NOTE(review): root stays None when no visited node equals dt.begin()
            # (e.g. a sentence without words); the attribute access below then
            # raises AttributeError — confirm whether such sentences can occur.
            dep_tree.root = root
            safe_execute(
                c,
                "UPDATE Dep_Tree SET ID_Dep_Tree_Node = ? WHERE ID_Dep_Tree = ?",
                0, _state_queue, _id_process,
                mark_args=(root.id_dep_tree_node, id_dep_tree))

            # Add children relations
            root_node = dt.begin()
            _rec_children(c, root_node, _state_queue, _id_process)

        if _state_queue is None:
            print("")
        safe_commit(conn, 0, _state_queue, _id_process)
    finally:
        # Release the connection even if a request fails.
        conn.close()
def add_lemmas_to_sentences(sentences, print_lemmas=False, _state_queue=None, _id_process=None,
                            freeling_modules=None):
    """
    Performs a Freeling process to add lemmas to words.\n
    However, the argument is actually a sentence to better fit Freeling usage.\n
    Our sentences will be converted to a Freeling Sentences before processing.\n
    Each lemma is inserted into the table Lemma and referenced from the table Word.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_lemmas: If True, print lemmatization results
    :type print_lemmas: boolean
    :param _state_queue: If not None, a ProcessState is put on it while Freeling is loading. Also forwarded
        to the state and database helpers. If None, an empty line is printed once all the lemmas are added.
    :param _id_process: Process identifier forwarded to the state and database helpers
    :param freeling_modules: Already loaded ``morfo`` module. If None, it is obtained from
        ``init_freeling()``.
    """
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo = init_freeling()
    else:
        morfo = freeling_modules

    # Print state
    _lemmatization_state(_state_queue, _id_process)

    processed_sentences = morfo.analyze(freeling_sentences)

    # Copy freeling results into our Words
    for s, sentence in enumerate(sentences):
        processed_sentence = processed_sentences[s]
        if len(sentence.words) != len(processed_sentence):
            print("/!\\ Warning, sentence offset error in lemma_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentence])

        for w, word in enumerate(sentence.words):
            word.lemma = processed_sentence[w].get_lemma()
            if print_lemmas:
                print(word.word + " : " + word.lemma)

    # Add lemmas to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    try:
        c = conn.cursor()

        total_sentence = len(sentences)
        # Initial state, sent before the first sentence is handled
        _commit_state(_state_queue, _id_process, " - ", " - ")
        for sentence_count, sentence in enumerate(sentences, start=1):
            # Print state
            _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

            for word in sentence.words:
                # Add Lemma to Lemma Table
                safe_execute(c,
                             "INSERT INTO Lemma (Lemma, ID_Word) VALUES (?, ?)",
                             0, _state_queue, _id_process,
                             mark_args=(word.lemma, word.id_word))

                # Get back id of last inserted lemma
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_lemma = c.fetchone()[0]

                # Update Word table (values bound as parameters, like the INSERT above)
                safe_execute(
                    c,
                    "UPDATE Word SET ID_Lemma = ? WHERE ID_Word = ?",
                    0, _state_queue, _id_process,
                    mark_args=(id_lemma, word.id_word))

        if _state_queue is None:
            print("")
        safe_commit(conn, 0, _state_queue, _id_process)
    finally:
        # Release the connection even if a request fails.
        conn.close()