def method_3(words, bid, cap, threshold):
    try:
        updateStatus(bid, cap, "running")
        length = len(words)
        links = []
        count_df = []
        page_words = {}
        # Fill page_words, a dictionary linking each word to its corresponding wiki page.
        page_finder(words, page_words)
        # Check whether a word appears in the links of another word's page:
        # append 1 to the links array if it does, 0 otherwise.
        count_concept(words, links, page_words)
        # Populate count_df, where each cell holds the number of times a word
        # appears in the links of the other words' pages; used to compute RefD.
        counter_df(links, count_df, length, words)
        for concept in words:
            for word in [word for word in words if word != concept]:
                valueRefD = refD(concept, word, links, length, count_df, words)
                bs = Baseline_Methods.query.filter_by(bid=bid, cap=cap,
                                                      lemma1=concept, lemma2=word).first()
                if not bs:
                    bs = Baseline_Methods(bid=bid, cap=cap, lemma1=concept,
                                          lemma2=word, m3=valueRefD)
                    db.session.add(bs)
                else:
                    bs.m3 = valueRefD
        db.session.add(Bs_threshold(bid=bid, cap=cap, method=3, threshold=threshold))
        db.session.commit()
        updateStatus(bid, cap, "succeeded")
    except Exception:
        updateStatus(bid, cap, "failed")
        print("error:", sys.exc_info())
        raise
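# `refD` is used above but not defined in this section. Below is a minimal
# sketch of the underlying RefD metric (Liang et al., 2015) with EQUAL
# weighting, written against a simplified dict-of-sets link representation
# rather than the flattened links/count_df arrays above, so the signature and
# data layout here are illustrative assumptions, not the project's actual code:
def refd_equal_sketch(a, b, out_links):
    """RefD(A, B): how often A's linked concepts refer to B, minus the
    reverse. Positive values suggest B is a prerequisite of A.

    out_links maps each concept to the set of concepts its wiki page links to.
    """
    rel_a = out_links.get(a, set())
    rel_b = out_links.get(b, set())
    if not rel_a or not rel_b:
        return 0.0
    refs_to_b = sum(1 for c in rel_a if b in out_links.get(c, set()))
    refs_to_a = sum(1 for c in rel_b if a in out_links.get(c, set()))
    return refs_to_b / len(rel_a) - refs_to_a / len(rel_b)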
def populateDb(missingRel, cosinDict, lernDict, bid, cap):
    """Populate the db with the normalized cosine-similarity (m4a) and
    learning-level (m4b) scores for each missing relation."""
    for key in missingRel:
        name = key.split("__")
        a, b = name[0], name[1]
        bs = Baseline_Methods.query.filter_by(bid=bid, cap=cap,
                                              lemma1=b, lemma2=a).first()
        if not bs:
            bs = Baseline_Methods(bid=bid, cap=cap, lemma1=b, lemma2=a,
                                  m4a=cosinDict[key], m4b=lernDict[key])
            db.session.add(bs)
        else:
            bs.m4a = cosinDict[key]
            bs.m4b = lernDict[key]
    db.session.commit()  # persist the new scores
def populate_db(self):
    """Create or update the Baseline_Methods row for each concept pair,
    storing the TOC distance as m5."""
    for key, value in self.tocDistance.items():
        parts = key.split("_")
        concept, lemma = parts[0], parts[1]
        bs = Baseline_Methods.query.filter_by(bid=self.bid, cap=self.cap,
                                              lemma1=concept, lemma2=lemma).first()
        if not value:
            value = 0
        if not bs:
            bs = Baseline_Methods(bid=self.bid, cap=self.cap, lemma1=concept,
                                  lemma2=lemma, m5=float(value))
            db.session.add(bs)
        else:
            bs.m5 = float(value)
    db.session.commit()
def populate_db(self, words, bid, cap):
    """Loop over words and create or update the corresponding row in the
    Baseline_Methods table. The value of m2 is based on the presence of
    lemma2 inside pre_req[lemma1].

    :param words:
    :param bid:
    :param cap:
    """
    for concept in words:
        for lemma in [lemma for lemma in words if concept != lemma]:
            bs = Baseline_Methods.query.filter_by(bid=bid, cap=cap,
                                                  lemma1=concept, lemma2=lemma).first()
            if lemma in self.pre_req[concept]:
                try:
                    phrase = int(self.phrase_id[concept + "_" + lemma])
                except (KeyError, ValueError):
                    phrase = 0
                m2, m2_sentence = 1, phrase
            else:
                m2, m2_sentence = 0, 0
            if not bs:
                bs = Baseline_Methods(bid=bid, cap=cap, lemma1=concept, lemma2=lemma,
                                      m2=m2, m2_sentence=m2_sentence)
                db.session.add(bs)
            else:
                bs.m2 = m2
                bs.m2_sentence = m2_sentence
    db.session.commit()
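# The m2 logic above reduces to: flag = 1 when lemma is a known prerequisite
# of concept, plus the id of the sentence where the pair was matched. A
# self-contained sketch of that decision; the pre_req/phrase_id contents are
# invented for illustration only:
pre_req = {"graph": ["node"], "node": []}
phrase_id = {"graph_node": "12"}  # sentence index where the pattern matched

def m2_for_pair(concept, lemma):
    """Return (m2, m2_sentence) for a concept/lemma pair."""
    if lemma in pre_req.get(concept, []):
        try:
            return 1, int(phrase_id[concept + "_" + lemma])
        except (KeyError, ValueError):
            return 1, 0
    return 0, 0

assert m2_for_pair("graph", "node") == (1, 12)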
def populate_db(self, words, bid, cap):
    """Loop over words and create or update the corresponding row in the
    Baseline_Methods table. The value of m1 is based on the presence of
    lemma2 inside pre_req[lemma1].

    :param words:
    :param bid:
    :param cap:
    """
    for concept in words:
        for lemma in [lemma for lemma in words if concept != lemma]:
            bs = Baseline_Methods.query.filter_by(bid=bid, cap=cap,
                                                  lemma1=concept, lemma2=lemma).first()
            if not bs:
                bs = Baseline_Methods(bid=bid, cap=cap, lemma1=concept, lemma2=lemma,
                                      m1=1 if lemma in self.pre_req[concept] else 0)
                db.session.add(bs)
            elif lemma in self.pre_req[concept]:
                bs.m1 = 1
            elif not bs.m1:
                bs.m1 = 0
    db.session.commit()
def method_6(self):
    """Launch the burst analysis."""
    try:
        # FIRST PHASE: extract bursts
        burst_extr = BurstExtractor(text=self.text, wordlist=self.words)
        burst_extr.find_offsets(occ_index_file=None)
        burst_extr.generate_bursts(s=self.S, gamma=self.GAMMA)
        burst_extr.filter_bursts(level=self.LEVEL,
                                 save_monolevel_keywords=True,
                                 replace_original_results=True)
        burst_extr.break_bursts(burst_length=30, num_occurrences=3,
                                replace_original_results=True)
        burst_res = burst_extr.bursts
        if burst_res.empty:
            raise ValueError("The chosen parameters do not produce results")

        # obtain json with first, last, ongoing, unique tags
        bursts_json = burst_proc.get_json_with_bursts(burst_res, self.occurrences)

        # SECOND PHASE: detect Allen's relations between bursts
        # and assign weights to the burst pairs
        weight_assigner = WeightAssigner(bursts=burst_res,
                                         relations_weights=self.ALLEN_WEIGHTS)
        weight_assigner.detect_relations(max_gap=self.MAX_GAP, alpha=0.05,
                                         find_also_inverse=self.USE_INVERSES)
        # output data for the gantt interface and ML projects
        burst_pairs_df = weight_assigner.burst_pairs
        bursts_weights = weight_assigner.bursts_weights

        # THIRD PHASE: normalize the bursts' weights
        weight_norm = WeightsNormalizer(bursts=burst_res,
                                        burst_pairs=burst_pairs_df,
                                        burst_weight_matrix=bursts_weights)
        weight_norm.normalize(formula=self.NORM_FORMULA,
                              occ_index_file=self.occurrences)
        burst_norm = weight_norm.burst_norm.round(decimals=3)

        # FINAL STEP: give directionality to the concept matrix built with bursts
        directed_burst = burst_proc.give_direction_using_first_burst(
            undirected_matrix=burst_norm,
            bursts_results=burst_res,
            indexes=self.occurrences,
            level=self.LEVEL,
            preserve_relations=self.PRESERVE_RELATIONS)

        # add rows and columns in the matrices for possibly discarded terms
        missing_terms = [term for term in self.words
                         if term not in directed_burst.index]
        for term in missing_terms:
            directed_burst.loc[term] = 0
            directed_burst[term] = 0

        # get an edgelist with the extracted prerequisite relations
        sorted_edgelist = pd.DataFrame(burst_proc.to_edgelist(directed_burst),
                                       columns=["prerequisite", "target", "weight"])

        ### SAVE DATA TO THE DATABASE
        # save the results
        for row in sorted_edgelist.itertuples():
            bs = Baseline_Methods.query.filter_by(bid=self.bid, cap=self.cap,
                                                  lemma1=row.target,
                                                  lemma2=row.prerequisite).first()
            if not bs:
                bs = Baseline_Methods(bid=self.bid, cap=self.cap,
                                      lemma1=row.target, lemma2=row.prerequisite,
                                      m6=row.weight)
                db.session.add(bs)
            else:
                bs.m6 = row.weight

        # save the parameters used
        params = Burst_params.query.filter_by(bid=self.bid, cap=self.cap).first()
        if not params:
            params = Burst_params(bid=self.bid, cap=self.cap, s=self.S,
                                  gamma=self.GAMMA, level=self.LEVEL)
            db.session.add(params)
        else:
            params.s = self.S
            params.gamma = self.GAMMA
            params.level = self.LEVEL
        for typ in self.ALLEN_WEIGHTS:
            allen = Burst_params_allen.query.filter_by(bid=self.bid, cap=self.cap,
                                                       type=typ).first()
            if not allen:
                allen = Burst_params_allen(bid=self.bid, cap=self.cap, type=typ,
                                           weight=self.ALLEN_WEIGHTS[typ])
                db.session.add(allen)
            else:
                allen.weight = self.ALLEN_WEIGHTS[typ]

        # save the burst results
        old_bursts = Burst_results.query.filter_by(bid=self.bid, cap=self.cap).all()
        for old in old_bursts:
            db.session.delete(old)
        for burst in bursts_json:
            b = Burst_results.query.filter_by(burst_id=burst["ID"], bid=self.bid,
                                              cap=self.cap).first()
            if not b:
                b = Burst_results(burst_id=burst["ID"], bid=self.bid, cap=self.cap,
                                  lemma=burst["concept"], start=burst["startSent"],
                                  end=burst["endSent"], freq=burst["freqOfTerm"],
                                  status=burst["status"])
                db.session.add(b)
            else:
                b.lemma = burst["concept"]
                b.start = burst["startSent"]
                b.end = burst["endSent"]
                b.freq = burst["freqOfTerm"]
                b.status = burst["status"]

        # save the relations between the burst pairs
        old_bursts_pairs = Burst_rel_allen.query.filter_by(bid=self.bid,
                                                           cap=self.cap).all()
        for old in old_bursts_pairs:
            db.session.delete(old)
        for burst_pair in burst_pairs_df.itertuples():
            b = Burst_rel_allen.query.filter_by(bid=self.bid, cap=self.cap,
                                                burst1=burst_pair.Bx_id,
                                                burst2=burst_pair.By_id).first()
            if not b:
                b = Burst_rel_allen(bid=self.bid, cap=self.cap,
                                    burst1=burst_pair.Bx_id,
                                    burst2=burst_pair.By_id,
                                    type=burst_pair.Rel)
                db.session.add(b)
            else:
                b.type = burst_pair.Rel

        db.session.commit()
        self.updateStatus("modifiable")
    except ValueError as e:
        print("error:", sys.exc_info())
        self.updateStatus("failed")
        raise e
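# method_6 relies on several configuration attributes (self.S, self.GAMMA,
# self.LEVEL, self.MAX_GAP, self.ALLEN_WEIGHTS, ...). A hypothetical
# configuration sketch: the attribute names match those used above, but
# every value below is an illustrative assumption, not the project's default.
S = 1.05               # Kleinberg burst extraction: scaling between burst states
GAMMA = 0.0001         # Kleinberg burst extraction: cost of moving to a higher state
LEVEL = 1              # burst level kept by filter_bursts
MAX_GAP = 10           # max gap allowed when detecting Allen's relations
USE_INVERSES = False   # whether detect_relations also looks for inverse relations
PRESERVE_RELATIONS = True
NORM_FORMULA = "modified"  # formula label passed to WeightsNormalizer.normalize
ALLEN_WEIGHTS = {          # one weight per Allen interval relation (keys assumed)
    "equals": 2, "before": 5, "meets": 3, "overlaps": 7,
    "during": 7, "starts": 4, "finishes": 2,
}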
def method_4(words, bid, cap):
    try:
        updateStatus(bid, cap, "running")
        missingRel = []
        page_words = {}
        lernDict = {}
        cosinDict = {}
        wiki_backlinks = {}
        page_finder(words, page_words, wiki_backlinks)
        print("pages found")
        result = topic_model(page_words)
        ldamodel = result[0]
        doc_term_matrix = result[1]
        for a in list(page_words.keys()):
            for b in [x for x in list(page_words.keys()) if x != a]:
                if usage_definition(a, b, page_words):
                    bs = Baseline_Methods.query.filter_by(bid=bid, cap=cap,
                                                          lemma1=b, lemma2=a).first()
                    if not bs:
                        bs = Baseline_Methods(bid=bid, cap=cap, lemma1=b, lemma2=a, m4=1)
                        db.session.add(bs)
                    else:
                        bs.m4 = 1
                else:
                    # inLinksDiff = in_links(a, b, page_words)
                    inLinksDiff = in_links(a, b, wiki_backlinks)
                    outLinksDiff = out_links(a, b, page_words)
                    topicCovDiff = entropy(a, b, ldamodel, page_words, doc_term_matrix)
                    contentSim = cosinesim(a, b, page_words)
                    if outLinksDiff == 0:
                        learnLevelDiff = topicCovDiff
                    else:
                        learnLevelDiff = inLinksDiff / outLinksDiff + topicCovDiff
                    lernDict[a + "__" + b] = learnLevelDiff
                    cosinDict[a + "__" + b] = contentSim
                    missingRel.append(a + "__" + b)
                    # TODO: define fixed threshold1 and threshold2 in the code;
                    # if both thresholds are exceeded, set m4 to 1:
                    # if learnLevelDiff > threshold1 and contentSim > threshold2:
                    #     pre_req[b].append(a)
        db.session.commit()
        normalize(cosinDict)
        normalize(lernDict)
        populateDb(missingRel, cosinDict, lernDict, bid, cap)
        updateStatus(bid, cap, "succeeded")
    except Exception:
        updateStatus(bid, cap, "failed")
        print("error:", sys.exc_info())
        raise
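# `normalize` is called above but not defined in this section. A minimal
# sketch, assuming it performs an in-place min-max normalization of the
# dictionary values to [0, 1]; the project's actual implementation may differ:
def normalize(score_dict):
    """Min-max normalize the values of score_dict in place."""
    if not score_dict:
        return
    lo, hi = min(score_dict.values()), max(score_dict.values())
    span = hi - lo
    for key, value in score_dict.items():
        score_dict[key] = (value - lo) / span if span else 0.0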