def safe_commit(conn, try_number, state_queue, id_process):
    import os
    import time
    import sqlite3 as sql
    from loacore.utils.status import ProcessState
    from loacore.conf import MAX_DB_COMMIT_ATTEMPTS
    try:
        conn.commit()
    except sql.OperationalError:
        if try_number <= MAX_DB_COMMIT_ATTEMPTS:
            try_number += 1
            if id_process is not None:
                print("[Process " + str(id_process) + "] Commit attempt number : " + str(try_number))
            else:
                print("Commit attempt number : " + str(try_number))
            if state_queue is not None:
                state_queue.put(
                    ProcessState(id_process, os.getpid(), "DB failed, retry.", str(try_number)))
            time.sleep(10)
            # try_number was already incremented above: pass it as-is so the
            # counter advances by one per retry, not two.
            safe_commit(conn, try_number, state_queue, id_process)
        else:
            if state_queue is not None:
                state_queue.put(
                    ProcessState(id_process, os.getpid(), "DB commit failed.", " X "))
            if id_process is not None:
                print("[Process " + str(id_process) + "] Commit fail.")
            else:
                print("Commit fail.")
def safe_execute(c, request, try_number, state_queue, id_process, mark_args=None, execute_many=False):
    import os
    import time
    import sqlite3 as sql
    from loacore.utils.status import ProcessState
    from loacore.conf import MAX_DB_COMMIT_ATTEMPTS
    try:
        if mark_args is not None:
            if not execute_many:
                c.execute(request, mark_args)
            else:
                c.executemany(request, mark_args)
        else:
            c.execute(request)
    except sql.OperationalError:
        if try_number <= MAX_DB_COMMIT_ATTEMPTS:
            try_number += 1
            if id_process is not None:
                print("[Process " + str(id_process) + "] Execute attempt number : " + str(try_number))
            else:
                print("Execute attempt number : " + str(try_number))
            if state_queue is not None:
                state_queue.put(
                    ProcessState(id_process, os.getpid(), "DB failed, retry.", str(try_number)))
            time.sleep(10)
            # Pass the already-incremented try_number, and keep the
            # execute_many flag, which the retry call previously dropped.
            safe_execute(c, request, try_number, state_queue, id_process,
                         mark_args=mark_args, execute_many=execute_many)
        else:
            if state_queue is not None:
                state_queue.put(
                    ProcessState(id_process, os.getpid(), "DB execute failed.", " X "))
            if id_process is not None:
                print("[Process " + str(id_process) + "] Execute fail.")
            else:
                print("Execute fail.")
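# Hedged usage sketch for the two retry helpers above, not part of the
# original API. It assumes DB_PATH and DB_TIMEOUT are importable from
# loacore.conf (the processing functions below use them this way), and that
# callers start the retry budget at try_number=0. Outside a worker process,
# the state queue and process id can simply be None.
def _demo_safe_db_usage():
    import sqlite3 as sql
    from loacore.conf import DB_PATH, DB_TIMEOUT

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()
    # Any statement works; last_insert_rowid() is the pattern used below.
    safe_execute(c, "SELECT last_insert_rowid()", 0, None, None)
    safe_commit(conn, 0, None, None)
    conn.close()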
def printer(stdscr):
    import curses
    try:
        refresh_count = 0

        def plot_window():
            nonlocal refresh_count
            refresh_count += 1
            if refresh_count >= 1000:
                # Periodic full clear to get rid of any leftover artifacts.
                refresh_count = 0
                stdscr.clear()
            stdscr.move(0, 0)
            stdscr.addstr(0, 0, "Process")
            stdscr.addstr(0, 14, "PID")
            stdscr.addstr(0, 21, "Activity")
            stdscr.addstr(0, 45, "Progress")
            stdscr.move(0, 0)
            stdscr.chgat(curses.A_REVERSE)
            for i in range(min(curses.LINES - 1, num_process)):
                items = processes[i + 1].state_str()
                stdscr.move(i + 1, 0)
                stdscr.clrtoeol()
                stdscr.addstr(i + 1, 0, items[0])
                stdscr.addstr(i + 1, 14, items[1])
                stdscr.addstr(i + 1, 21, items[2])
                stdscr.addstr(i + 1, 45, items[3])
            if num_process + 1 <= curses.LINES:
                stdscr.move(num_process + 1, 0)
            stdscr.refresh()

        print("Printer initialized")
        for n in unterminated_processes:
            processes[n] = ProcessState(n, "-", "Waiting", "-")
        old_lines = curses.LINES
        while len(unterminated_processes) > 0:
            curses.update_lines_cols()
            if curses.LINES != old_lines:
                plot_window()
                old_lines = curses.LINES
            while not q.empty():
                state = q.get()
                processes[state.id_process] = state
                if state.activity == "Terminated" or state.activity == "DB error":
                    unterminated_processes.remove(state.id_process)
                plot_window()

        import os
        from loacore.conf import OUTPUT_PATH
        f = open(os.path.join(OUTPUT_PATH, "result.log"), "w")
        for i in processes.keys():
            items = processes[i].state_str()
            f.write(items[0] + '\t' + items[1] + '\t' + items[2] + '\t' + items[3] + '\t\n')
        f.close()
    except:
        # Leave a marker file behind: curses owns the terminal here, so a
        # printed traceback would be lost. "w" mode creates the file.
        import os
        from loacore.conf import OUTPUT_PATH
        file = open(os.path.join(OUTPUT_PATH, "debug_curse.txt"), "w")
        file.close()
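# Hedged usage sketch, not part of the original module: printer relies on
# q (a Queue of ProcessState updates), processes, num_process and
# unterminated_processes being visible in its enclosing scope, so the
# coordinating process is expected to drive it through curses.wrapper,
# which sets up and restores the terminal around printer(stdscr).
def _demo_run_printer():
    import curses
    curses.wrapper(printer)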
def _commit_state(state_queue, id_process, sentence_count, total_sentence):
    if state_queue is not None:
        state_queue.put(
            ProcessState(id_process, os.getpid(), "Synset DB commit...",
                         str(sentence_count) + " / " + str(total_sentence)))
    else:
        print("\r" + str(sentence_count) + " / " + str(total_sentence) + " sentences added.", end="")
def _tokenization_state(state_queue, id_process, review_count, total_review):
    if state_queue is not None:
        state_queue.put(
            ProcessState(id_process, os.getpid(), "Tokenization",
                         str(review_count) + " / " + str(total_review)))
    else:
        print("\r" + str(review_count) + " / " + str(total_review) + " reviews processed.", end="")
def _commit_polarity_state(state_queue, id_process, sentence_count, total_sentence):
    if state_queue is not None:
        state_queue.put(
            ProcessState(id_process, os.getpid(), "Add polarity to synset",
                         str(sentence_count) + " / " + str(total_sentence)))
    else:
        print("\r" + str(sentence_count) + " / " + str(total_sentence) + " polarities added.", end="")
def add_synsets_to_sentences(sentences, print_synsets=False, _state_queue=None, _id_process=None,
                             freeling_modules=None):
    """
    Performs a Freeling process to disambiguate the words of the sentences according to their
    context (UKB algorithm), linking each of them to a unique synset when possible.\n
    Our sentences are converted to Freeling Sentences before processing.\n
    Notice that even if we may have already computed the lemmas, for example, the Freeling
    Sentences generated from our sentences are "raw sentences", without any analysis linked to
    their Words. So the whole Freeling process is redone from scratch every time, except
    *tokenization* and *sentence splitting*, to avoid any confusion.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_synsets: If True, print disambiguation results
    :type print_synsets: boolean
    """
    from loacore.conf import DB_TIMEOUT
    from loacore.utils.db import safe_commit, safe_execute

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tagger, sen, wsd = init_freeling()
    else:
        morfo, tagger, sen, wsd = freeling_modules

    _disambiguation_state(_state_queue, _id_process)

    # Perform morphosyntactic analysis and tagging
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)

    # Annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Copy Freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]
        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in synset_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])
        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                if not rank[0][0][0] == '8':
                    # Ignore synset offsets "8.......-":
                    # they are odd synsets that WordNet can't find.
                    word.synset = Synset(None, word.id_word, rank[0][0],
                                         wn.of2ss(rank[0][0]).name(), None, None, None)
                    if print_synsets:
                        print("Word : " + word.word)
                        print("Synset code : " + rank[0][0])
                        print("Synset name : " + wn.of2ss(rank[0][0]).name())

    # Add synsets to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()
    sentence_count = 0
    total_sentence = len(sentences)
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

        for word in sentence.words:
            synset = word.synset
            if synset is not None:
                # Add synset
                safe_execute(c,
                             "INSERT INTO Synset (ID_Word, Synset_Code, Synset_Name) "
                             "VALUES (?, ?, ?)",
                             0, _state_queue, _id_process,
                             mark_args=(word.id_word, synset.synset_code, synset.synset_name))

                # Get back the id of the last inserted synset
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_synset = c.fetchone()[0]

                # Update Word table
                safe_execute(c,
                             "UPDATE Word SET ID_Synset = " + str(id_synset) +
                             " WHERE ID_Word = " + str(word.id_word),
                             0, _state_queue, _id_process)

    safe_commit(conn, 0, _state_queue, _id_process)
    conn.close()
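# Hedged usage sketch, not part of the original module: run disambiguation
# on sentences reloaded with their words. It assumes
# sentence_load.load_sentences accepts id_sentences/load_words the same way
# _split_reviews_process calls it below.
def _demo_add_synsets(id_sentences):
    import loacore.load.sentence_load as sentence_load
    sentences = sentence_load.load_sentences(id_sentences=id_sentences, load_words=True)
    # Sequential use: no state queue or process id needed.
    add_synsets_to_sentences(sentences, print_synsets=True)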
def _disambiguation_state(state_queue, id_process):
    if state_queue is not None:
        state_queue.put(ProcessState(id_process, os.getpid(), "Disambiguation", "-"))
    else:
        print("Disambiguation", end="\n")
def _split_reviews_process(reviews, freeling_modules, _state_queue=None, _id_process=None, interrupt=None):
    try:
        import os
        from loacore.utils.status import ProcessState

        # Tokenization + add all sentences and all words from all reviews
        import loacore.process.sentence_process as sentence_process
        added_sentences = sentence_process.add_sentences_from_reviews(
            reviews,
            _state_queue=_state_queue,
            _id_process=_id_process,
            freeling_modules=(freeling_modules["morfo"],
                              freeling_modules["tk"],
                              freeling_modules["sp"]))

        # Reload sentences with words
        import loacore.load.sentence_load as sentence_load
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Reload Sentences", "-"))
        else:
            print("Reload Sentences...")
        sentences = sentence_load.load_sentences(
            id_sentences=[s.id_sentence for s in added_sentences],
            load_words=True)

        # Some test outputs ############################################
        from loacore.conf import OUTPUT_PATH
        f = open(os.path.join(OUTPUT_PATH, "test_sentence.txt"), 'w')
        f.write(str(len(sentences)) + "\n")
        for s in sentences:
            f.write(str(len(s.words)) + "\t" + s.sentence_str() + "\n")
        f.close()
        #################################################################

        # Lemmatization
        import loacore.process.lemma_process as lemma_process
        lemma_process.add_lemmas_to_sentences(
            sentences,
            _state_queue=_state_queue,
            _id_process=_id_process,
            freeling_modules=freeling_modules["morfo"])

        # Disambiguation
        import loacore.process.synset_process as synset_process
        synset_process.add_synsets_to_sentences(
            sentences,
            _state_queue=_state_queue,
            _id_process=_id_process,
            freeling_modules=(freeling_modules["morfo"],
                              freeling_modules["tagger"],
                              freeling_modules["sen"],
                              freeling_modules["wsd"]))

        # Synset polarities
        id_words = [w.id_word for s in sentences for w in s.words]
        synset_process.add_polarity_to_synsets(id_words,
                                               _state_queue=_state_queue,
                                               _id_process=_id_process)

        # Dep tree
        import loacore.process.deptree_process as deptree_process
        deptree_process.add_dep_tree_from_sentences(
            sentences,
            _state_queue=_state_queue,
            _id_process=_id_process,
            freeling_modules=(freeling_modules["morfo"],
                              freeling_modules["tagger"],
                              freeling_modules["sen"],
                              freeling_modules["wsd"],
                              freeling_modules["parser"]))

        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Terminated", " - "))
    except:
        from loacore.conf import OUTPUT_PATH
        import logging
        import os
        # Leave a per-PID marker file ("w" creates it), then log the
        # full traceback and signal the parent through the interrupt queue.
        file = open(os.path.join(OUTPUT_PATH, str(os.getpid()) + ".txt"), "w")
        file.close()
        logging.basicConfig(filename=os.path.join(OUTPUT_PATH, "error_log.out"))
        logging.exception("Process " + str(os.getpid()) + " interrupted.")
        if interrupt is not None:
            interrupt.put("error")
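# Hedged usage sketch, not part of the original module: _split_reviews_process
# is written to run as a worker, reporting through a ProcessState queue and
# signalling fatal errors on an interrupt queue. The freeling_modules dict
# keys ("morfo", "tk", "sp", "tagger", "sen", "wsd", "parser") are exactly
# the ones the function accesses; building the modules is left to the
# caller's Freeling setup, and depending on how they pickle they may need to
# be initialised inside the worker instead of being passed in.
def _demo_spawn_split_process(reviews, freeling_modules):
    import multiprocessing as mp
    state_queue = mp.Queue()
    interrupt = mp.Queue()
    p = mp.Process(target=_split_reviews_process,
                   args=(reviews, freeling_modules),
                   kwargs={"_state_queue": state_queue,
                           "_id_process": 1,
                           "interrupt": interrupt})
    p.start()
    p.join()
    return state_queue, interrupt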
def _parsing_state(state_queue, state, id_process):
    if state_queue is not None:
        state_queue.put(ProcessState(id_process, os.getpid(), state, "-"))
    else:
        print(state, end="\n")
def add_dep_tree_from_sentences(sentences, print_result=False, _state_queue=None, _id_process=None,
                                freeling_modules=None):
    """
    Generates the dependency trees of the specified sentences and adds the results to the
    database.\n
    Sentences are first converted into "raw" Freeling sentences (without any analysis), and then
    all the necessary Freeling processes are performed.\n
    The PoS_tags of the words are also computed and added to the database in this function.\n

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    .. note:: This process can be quite long (at least a few minutes).

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_result: Print PoS_tags and labels associated to each |Word|
    :type print_result: boolean
    """
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tagger, sen, wsd, parser = init_freeling()
    else:
        morfo, tagger, sen, wsd, parser = freeling_modules

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    # Print state
    _parsing_state(_state_queue, "DT Tagging...", _id_process)

    # Perform morphosyntactic analysis
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "DT Disambiguation...", _id_process)

    # Annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "Dep Tree Parsing...", _id_process)

    # Dependency tree parsing
    processed_sentences = parser.analyze(processed_sentences)

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()
    sentence_count = 0
    total_sentence = len(sentences)
    for s in range(len(sentences)):
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

        sentence = sentences[s]

        # Add dep_tree to database
        dt = processed_sentences[s].get_dep_tree()
        dep_tree = DepTree(None, None, sentence.id_sentence)
        safe_execute(c, "INSERT INTO Dep_Tree (ID_Sentence) VALUES (?)", 0, _state_queue, _id_process,
                     mark_args=[dep_tree.id_sentence])

        # Get back id_dep_tree
        safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
        id_dep_tree = c.fetchone()[0]
        dep_tree.id_dep_tree = id_dep_tree

        # Database process
        root = None
        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in deptree_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])
        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                word.PoS_tag = processed_sentences[s][w].get_tag()
                if print_result:
                    print("Word : " + word.word)
                    print("PoS_tag : " + processed_sentences[s][w].get_tag())
                    print("Label : " + dt.get_node_by_pos(w).get_label())

            # We use the get_node_by_pos function to map the tree to our sentence
            node = dt.get_node_by_pos(w)
            dep_tree_node = DepTreeNode(None, id_dep_tree, word.id_word, node.get_label(), 0)
            if node == dt.begin():
                dep_tree_node.root = 1
                root = dep_tree_node

            # Add DepTreeNode to database
            safe_execute(c,
                         "INSERT INTO Dep_Tree_Node (ID_Dep_Tree, ID_Word, Label, root) "
                         "VALUES (?, ?, ?, ?)",
                         0, _state_queue, _id_process,
                         mark_args=(dep_tree_node.id_dep_tree, dep_tree_node.id_word,
                                    dep_tree_node.label, dep_tree_node.root))

            # Get back id_dep_tree_node
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
            id_dep_tree_node = c.fetchone()[0]
            dep_tree_node.id_dep_tree_node = id_dep_tree_node

            # Use the Freeling set_node_id function to store our db node id in the Freeling node
            node.set_node_id(str(id_dep_tree_node))

            # Add PoS_tag to Word
            if word.PoS_tag is not None:
                safe_execute(c,
                             "UPDATE Word SET PoS_tag = '" + word.PoS_tag + "' "
                             "WHERE ID_Word = " + str(word.id_word),
                             0, _state_queue, _id_process)

        # Add dep_tree root to database
        dep_tree.root = root
        safe_execute(c,
                     "UPDATE Dep_Tree SET ID_Dep_Tree_Node = " + str(root.id_dep_tree_node) + " "
                     "WHERE ID_Dep_Tree = " + str(id_dep_tree),
                     0, _state_queue, _id_process)

        # Add children relations
        root_node = dt.begin()
        _rec_children(c, root_node, _state_queue, _id_process)

    if _state_queue is None:
        print("")
    safe_commit(conn, 0, _state_queue, _id_process)
    conn.close()
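# Hedged usage sketch, not part of the original module: when the Freeling
# modules have already been initialised by the caller, the
# (morfo, tagger, sen, wsd, parser) tuple can be passed in to skip the
# per-call init_freeling() load, mirroring the unpacking above.
def _demo_add_dep_trees(sentences, morfo, tagger, sen, wsd, parser):
    add_dep_tree_from_sentences(sentences,
                                print_result=False,
                                freeling_modules=(morfo, tagger, sen, wsd, parser))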
def add_sentences_from_reviews(reviews, _state_queue=None, _id_process=None, freeling_modules=None):
    """
    Performs the first Freeling process applied to each normalized review.\n
    Each review is tokenized and then split into sentences, thanks to the corresponding Freeling
    modules.\n
    A representation of the Sentences and their Words (tokens) is then added to the corresponding
    tables.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param reviews: Reviews to process
    :type reviews: :obj:`list` of |Review|
    :return: added sentences
    :rtype: :obj:`list` of |Sentence|
    """
    from loacore.classes.classes import Word
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tk, sp = init_freeling()
    else:
        morfo, tk, sp = freeling_modules

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    added_sentences = []

    review_count = 0
    try:
        total_review = len(reviews)
    except TypeError:
        # reviews is a ReviewIterator of unknown length.
        # Note: the second pass below re-iterates reviews, so a one-shot
        # iterator would arrive there exhausted.
        total_review = " - "

    for review in reviews:
        # Print state
        review_count += 1
        _tokenization_state(_state_queue, _id_process, review_count, total_review)

        raw_review = review.review
        tokens = tk.tokenize(raw_review)
        sentences = sp.split(tokens)
        sentences = morfo.analyze(sentences)

        review_index = 0
        for sentence in sentences:
            if len(sentence) <= 50:
                review_sentence = Sentence(None, review.id_review, review_index, None)
                review_index += 1
                # Add words
                sentence_index = 0
                for word in sentence:
                    review_sentence.words.append(
                        Word(None, None, sentence_index, word.get_form(), None, None, None))
                    sentence_index += 1
                review.sentences.append(review_sentence)

    sentence_count = 0
    total_sentence = len([s for r in reviews for s in r.sentences])
    for r in reviews:
        for s in r.sentences:
            # Print state
            sentence_count += 1
            _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

            # Add sentence
            safe_execute(c,
                         "INSERT INTO Sentence (ID_Review, Review_Index) VALUES (?, ?)",
                         0, _state_queue, _id_process,
                         mark_args=(s.id_review, s.review_index))

            # Get back id of last inserted sentence
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
            id_sentence = c.fetchone()[0]
            s.id_sentence = id_sentence

            sql_words = []
            for w in s.words:
                w.id_sentence = id_sentence
                sql_words.append((id_sentence, w.sentence_index, w.word))
            safe_execute(c,
                         "INSERT INTO Word (ID_Sentence, Sentence_Index, word) VALUES (?, ?, ?)",
                         0, _state_queue, _id_process,
                         mark_args=sql_words, execute_many=True)

            added_sentences.append(s)

    if _state_queue is None:
        print("")
    safe_commit(conn, 0, _state_queue, _id_process)
    conn.close()
    return added_sentences
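# Hedged usage sketch, not part of the original module: tokenize and split a
# list of already-normalized reviews, then read back the database ids that
# the function fills in on the returned Sentence objects.
def _demo_add_sentences(reviews):
    added = add_sentences_from_reviews(reviews)
    return [s.id_sentence for s in added]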
def add_lemmas_to_sentences(sentences, print_lemmas=False, _state_queue=None, _id_process=None,
                            freeling_modules=None):
    """
    Performs a Freeling process to add lemmas to words.\n
    Although lemmas are attached to words, the argument is a list of sentences, to better fit
    Freeling usage.\n
    Our sentences are converted to Freeling Sentences before processing.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_lemmas: If True, print lemmatization results
    :type print_lemmas: boolean
    """
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo = init_freeling()
    else:
        morfo = freeling_modules

    # Print state
    _lemmatization_state(_state_queue, _id_process)

    processed_sentences = morfo.analyze(freeling_sentences)

    # Copy Freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]
        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in lemma_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])
        for w in range(len(sentence.words)):
            word = sentence.words[w]
            word.lemma = processed_sentences[s][w].get_lemma()
            if print_lemmas:
                print(word.word + " : " + word.lemma)

    # Add lemmas to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()
    sentence_count = 0
    total_sentence = len(sentences)
    _commit_state(_state_queue, _id_process, " - ", " - ")
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count, total_sentence)

        for word in sentence.words:
            # Add lemma to Lemma table
            safe_execute(c, "INSERT INTO Lemma (Lemma, ID_Word) VALUES (?, ?)",
                         0, _state_queue, _id_process,
                         mark_args=(word.lemma, word.id_word))

            # Get back id of last inserted lemma
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
            id_lemma = c.fetchone()[0]

            # Update Word table
            safe_execute(c,
                         "UPDATE Word SET ID_Lemma = " + str(id_lemma) +
                         " WHERE ID_Word = " + str(word.id_word),
                         0, _state_queue, _id_process)

    if _state_queue is None:
        print("")
    safe_commit(conn, 0, _state_queue, _id_process)
    conn.close()
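# Hedged usage sketch, not part of the original module. Unlike the other
# processing functions, freeling_modules here is a single morphological
# analyzer rather than a tuple, as the unpacking above shows.
def _demo_add_lemmas(sentences, morfo=None):
    if morfo is not None:
        add_lemmas_to_sentences(sentences, print_lemmas=True, freeling_modules=morfo)
    else:
        # Falls back to the function's own init_freeling() call.
        add_lemmas_to_sentences(sentences, print_lemmas=True)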
def _lemmatization_state(state_queue, id_process):
    if state_queue is not None:
        state_queue.put(ProcessState(id_process, os.getpid(), "Lemmatization", "-"))
    else:
        print("Lemmatization", end="\n")