예제 #1
0
파일: Method_03.py 프로젝트: Teldh/PRET
def method_3(words, bid, cap, threshold):
    """Compute the RefD value for every ordered pair of distinct words and store it as ``m3``.

    The status for ``(bid, cap)`` is set to "running" at the start and to
    "succeeded" after the commit. On any error the session is rolled back,
    the status is set to "failed", the error is printed and re-raised.

    :param words: iterable of words/concepts to compare pairwise
    :param bid: identifier used (with ``cap``) to select rows in Baseline_Methods
    :param cap: identifier used (with ``bid``) to select rows in Baseline_Methods
    :param threshold: value recorded in a ``Bs_threshold`` row with ``method=3``
    :raises: whatever the helpers or the database raise (re-raised unchanged)
    """
    try:
        updateStatus(bid, cap, "running")

        length = len(words)
        links = []
        count_df = []
        page_words = {}

        # Per the original comments (not verifiable from this block):
        #  - page_finder links each word to its wiki page (filled into page_words);
        #  - count_concept records in `links` a 1 when a word appears among the
        #    links of another word's page, otherwise 0;
        #  - counter_df fills `count_df` with how many times each word appears in
        #    the links of the other words; refD uses it.
        page_finder(words, page_words)
        count_concept(words, links, page_words)
        counter_df(links, count_df, length, words)

        for concept in words:
            for word in [other for other in words if other != concept]:
                valueRefD = refD(concept, word, links, length, count_df, words)
                bs = Baseline_Methods.query.filter_by(
                    bid=bid, cap=cap, lemma1=concept, lemma2=word).first()
                if not bs:
                    # No row yet for this ordered pair: create it.
                    bs = Baseline_Methods(bid=bid, cap=cap, lemma1=concept,
                                          lemma2=word, m3=valueRefD)
                    db.session.add(bs)
                else:
                    bs.m3 = valueRefD

        # NOTE(review): a new Bs_threshold row is added on every run; confirm
        # whether an existing row for (bid, cap, method=3) should be updated.
        db.session.add(Bs_threshold(bid=bid, cap=cap, method=3, threshold=threshold))

        db.session.commit()
        updateStatus(bid, cap, "succeeded")
    except BaseException:
        # Same catch-all scope as before. Discard the uncommitted work first so
        # that the status update below starts from a clean session and cannot
        # persist half-computed rows.
        db.session.rollback()
        updateStatus(bid, cap, "failed")
        print("error:", sys.exc_info())
        raise
예제 #2
0
파일: Method_04.py 프로젝트: Teldh/PRET
def populateDb(missingRel, cosinDict, lernDict, bid, cap):
    """Write the m4a / m4b values of every pair in ``missingRel`` to Baseline_Methods.

    Each key has the form ``"<a>__<b>"``; the row is looked up (or created)
    with ``lemma1=<b>`` and ``lemma2=<a>``. ``m4a`` is read from ``cosinDict``
    and ``m4b`` from ``lernDict`` under the same key.

    This function only adds/updates rows in the session; it does not commit.

    :param missingRel: iterable of ``"a__b"`` keys
    :param cosinDict: mapping key -> value stored in ``m4a``
    :param lernDict: mapping key -> value stored in ``m4b``
    :param bid: identifier used to select/create the row
    :param cap: identifier used to select/create the row
    """
    for pair_key in missingRel:
        parts = pair_key.split("__")
        first, second = parts[0], parts[1]

        row = Baseline_Methods.query.filter_by(
            bid=bid, cap=cap, lemma1=second, lemma2=first).first()

        if row:
            # Existing row: overwrite both scores.
            row.m4a = cosinDict[pair_key]
            row.m4b = lernDict[pair_key]
            continue

        row = Baseline_Methods(
            bid=bid,
            cap=cap,
            lemma1=second,
            lemma2=first,
            m4a=cosinDict[pair_key],
            m4b=lernDict[pair_key],
        )
        db.session.add(row)
예제 #3
0
    def populate_db(self):
        """Store every value of ``self.tocDistance`` as ``m5`` in Baseline_Methods.

        Keys are split on "_": the first part is used as ``lemma1`` and the
        second as ``lemma2``. Falsy values are stored as 0; everything is
        converted with ``float``. Rows are created when missing, otherwise
        updated, and the session is committed once at the end.
        """
        for pair_key, raw_value in self.tocDistance.items():
            parts = pair_key.split("_")
            concept, lemma = parts[0], parts[1]

            row = Baseline_Methods.query.filter_by(
                bid=self.bid, cap=self.cap, lemma1=concept, lemma2=lemma).first()

            # Falsy values (None, 0, "", ...) are stored as 0.0.
            score = float(raw_value or 0)

            if row:
                row.m5 = score
            else:
                row = Baseline_Methods(
                    bid=self.bid,
                    cap=self.cap,
                    lemma1=concept,
                    lemma2=lemma,
                    m5=score,
                )
                db.session.add(row)

        db.session.commit()
예제 #4
0
파일: Method_02.py 프로젝트: Teldh/PRET
    def populate_db(self, words, bid, cap):
        """ Loop over words and create or update the corresponding row in the Baseline_Methods table.

            For every ordered pair (concept, lemma) of distinct words:
            m2 is 1 when lemma is in self.pre_req[concept], otherwise 0;
            m2_sentence is int(self.phrase_id[concept + "_" + lemma]) when m2 is 1
            and that lookup/conversion succeeds, otherwise 0.
            The session is committed once after all pairs are processed.

            :param words: iterable of words/concepts
            :param bid: identifier used to select/create the row
            :param cap: identifier used to select/create the row

        """
        for concept in words:
            for lemma in [lemma for lemma in words if concept != lemma]:
                bs = Baseline_Methods.query.filter_by(bid=bid,
                                                      cap=cap,
                                                      lemma1=concept,
                                                      lemma2=lemma).first()

                # Work out the values once; they are the same whether the row
                # is being created or updated.
                if lemma in self.pre_req[concept]:
                    m2 = 1
                    try:
                        phrase = int(self.phrase_id[concept + "_" + lemma])
                    except Exception:
                        # Best effort: a missing or non-numeric sentence id
                        # falls back to 0. KeyboardInterrupt/SystemExit are no
                        # longer swallowed here.
                        phrase = 0
                else:
                    m2 = 0
                    phrase = 0

                if not bs:
                    bs = Baseline_Methods(bid=bid,
                                          cap=cap,
                                          lemma1=concept,
                                          lemma2=lemma,
                                          m2=m2,
                                          m2_sentence=phrase)
                    db.session.add(bs)
                else:
                    bs.m2 = m2
                    bs.m2_sentence = phrase

        db.session.commit()
예제 #5
0
    def populate_db(self, words, bid, cap):
        """
        Create or update one Baseline_Methods row per ordered pair of distinct words.

        For a new row, m1 is 1 when the second word is in
        self.pre_req[first word], otherwise 0. For an existing row, m1 is set
        to 1 in that case; otherwise it is set to 0 only when its current value
        is falsy, so an already truthy m1 is left untouched. The session is
        committed once at the end.

        :param words
        :param bid
        :param cap

        """
        for concept in words:
            others = (candidate for candidate in words if concept != candidate)
            for lemma in others:
                row = Baseline_Methods.query.filter_by(
                    bid=bid, cap=cap, lemma1=concept, lemma2=lemma).first()
                is_prerequisite = lemma in self.pre_req[concept]

                if not row:
                    row = Baseline_Methods(
                        bid=bid,
                        cap=cap,
                        lemma1=concept,
                        lemma2=lemma,
                        m1=1 if is_prerequisite else 0,
                    )
                    db.session.add(row)
                elif is_prerequisite:
                    row.m1 = 1
                elif not row.m1:
                    # Only fill in a missing/zero value; never downgrade a 1.
                    row.m1 = 0

        db.session.commit()
예제 #6
0
파일: Method_06.py 프로젝트: Teldh/PRET
    def method_6(self):
        """Launch Burst analysis and store its results in the database.

        Steps, in order:
          1. Extract bursts from ``self.text`` for ``self.words`` with
             ``BurstExtractor`` (parameters ``self.S``, ``self.GAMMA``,
             ``self.LEVEL``); an empty result raises ``ValueError``.
          2. Detect relations between bursts and weight them with
             ``WeightAssigner`` (``self.ALLEN_WEIGHTS``, ``self.MAX_GAP``,
             ``self.USE_INVERSES``).
          3. Normalize the weights with ``WeightsNormalizer``
             (``self.NORM_FORMULA``) and round to 3 decimals.
          4. Give a direction to the matrix, add all-zero rows/columns for
             words missing from it, and turn it into an edge list.
          5. Persist: ``m6`` in Baseline_Methods (lemma1 = target,
             lemma2 = prerequisite), the parameters in Burst_params and
             Burst_params_allen, the bursts in Burst_results and the burst
             pair relations in Burst_rel_allen; then commit and set the
             status to "modifiable".

        :raises ValueError: when no bursts are produced (or when any step
            raises it); in that case the error is printed and the status is
            set to "failed" before re-raising. Other exception types are not
            caught here, so they propagate without a status update.
        """
        try:
            # FIRST PHASE: extract bursts
            #print("Extracting bursts...\n")

            burst_extr = BurstExtractor(text=self.text, wordlist=self.words)
            burst_extr.find_offsets(occ_index_file=None)
            burst_extr.generate_bursts(s=self.S, gamma=self.GAMMA)
            burst_extr.filter_bursts(level=self.LEVEL,
                                     save_monolevel_keywords=True,
                                     replace_original_results=True)
            burst_extr.break_bursts(burst_length=30,
                                    num_occurrences=3,
                                    replace_original_results=True)
            burst_res = burst_extr.bursts

            # Nothing to analyse: abort through the ValueError handler below.
            if burst_res.empty:
                raise ValueError(
                    "The chosen parameters do not produce results")

            # obtain json with first, last, ongoing, unique tags
            bursts_json = burst_proc.get_json_with_bursts(
                burst_res, self.occurrences)

            # SECOND PHASE: detect relations between bursts and assign weights to them
            #print("Detecting Allen's relations and assign weights to burst pairs...\n")
            weight_assigner = WeightAssigner(
                bursts=burst_res, relations_weights=self.ALLEN_WEIGHTS)
            weight_assigner.detect_relations(
                max_gap=self.MAX_GAP,
                alpha=0.05,
                find_also_inverse=self.USE_INVERSES)
            # output data for the gantt interface and ML projects
            burst_pairs_df = weight_assigner.burst_pairs

            bursts_weights = weight_assigner.bursts_weights

            # THIRD PHASE: normalize the bursts' weights
            #print("Normalizing the matrix with weights of burst pairs...\n")
            weight_norm = WeightsNormalizer(bursts=burst_res,
                                            burst_pairs=burst_pairs_df,
                                            burst_weight_matrix=bursts_weights)
            weight_norm.normalize(formula=self.NORM_FORMULA,
                                  occ_index_file=self.occurrences)

            burst_norm = weight_norm.burst_norm.round(decimals=3)

            # FINAL STEP: give directionality to bursts
            #print("Giving directionality to the concept matrix built with bursts...\n")

            directed_burst = burst_proc.give_direction_using_first_burst(
                undirected_matrix=burst_norm,
                bursts_results=burst_res,
                indexes=self.occurrences,
                level=self.LEVEL,
                preserve_relations=self.PRESERVE_RELATIONS)

            # add rows and columns in the matrices for possible discarded terms
            #print("\nAdding rows and columns for missing concepts in the burst matrix...\n")
            missing_terms = [
                term for term in self.words if term not in directed_burst.index
            ]

            # Each missing term gets a row and a column filled with 0.
            for term in missing_terms:
                directed_burst.loc[term] = 0
                directed_burst[term] = 0

            #print("Shape of final directed burst matrix:", directed_burst.shape)

            # get an edgelist with the extracted prerequisite relations
            #print("Getting an edgelist with the extracted prerequisite relations...\n")
            sorted_edgelist = pd.DataFrame(
                burst_proc.to_edgelist(directed_burst),
                columns=["prerequisite", "target", "weight"])

            ### SAVE DATA TO THE DATABASE

            # save the results: one Baseline_Methods row per edge, with the
            # target as lemma1 and the prerequisite as lemma2
            for row in sorted_edgelist.itertuples():
                bs = Baseline_Methods.query.filter_by(
                    bid=self.bid,
                    cap=self.cap,
                    lemma1=row.target,
                    lemma2=row.prerequisite).first()
                if not bs:
                    bs = Baseline_Methods(bid=self.bid,
                                          cap=self.cap,
                                          lemma1=row.target,
                                          lemma2=row.prerequisite,
                                          m6=row.weight)
                    db.session.add(bs)
                else:
                    bs.m6 = row.weight

            # save the parameters used
            params = Burst_params.query.filter_by(bid=self.bid,
                                                  cap=self.cap).first()
            if not params:
                params = Burst_params(bid=self.bid,
                                      cap=self.cap,
                                      s=self.S,
                                      gamma=self.GAMMA,
                                      level=self.LEVEL)
                db.session.add(params)
            else:
                params.s = self.S
                params.gamma = self.GAMMA
                params.level = self.LEVEL

            # one Burst_params_allen row per relation type in ALLEN_WEIGHTS
            for typ in self.ALLEN_WEIGHTS:
                allen = Burst_params_allen.query.filter_by(bid=self.bid,
                                                           cap=self.cap,
                                                           type=typ).first()
                if not allen:
                    allen = Burst_params_allen(bid=self.bid,
                                               cap=self.cap,
                                               type=typ,
                                               weight=self.ALLEN_WEIGHTS[typ])
                    db.session.add(allen)
                else:
                    allen.weight = self.ALLEN_WEIGHTS[typ]

            # save burst results: previous rows for (bid, cap) are deleted first
            old_bursts = Burst_results.query.filter_by(bid=self.bid,
                                                       cap=self.cap).all()

            for old in old_bursts:
                db.session.delete(old)

            # NOTE(review): the rows for (bid, cap) were marked for deletion
            # just above; whether this lookup can still return one of them
            # depends on the session's autoflush setting - TODO confirm.
            for burst in bursts_json:
                b = Burst_results.query.filter_by(burst_id=burst["ID"],
                                                  bid=self.bid,
                                                  cap=self.cap).first()

                if not b:
                    b = Burst_results(burst_id=burst["ID"],
                                      bid=self.bid,
                                      cap=self.cap,
                                      lemma=burst["concept"],
                                      start=burst["startSent"],
                                      end=burst["endSent"],
                                      freq=burst["freqOfTerm"],
                                      status=burst["status"])
                    db.session.add(b)
                else:
                    b.lemma = burst["concept"]
                    b.start = burst["startSent"]
                    b.end = burst["endSent"]
                    b.freq = burst["freqOfTerm"]
                    b.status = burst["status"]

            # save relations between burst pairs: previous rows for (bid, cap)
            # are deleted first, same pattern as for Burst_results above
            old_bursts_pairs = Burst_rel_allen.query.filter_by(
                bid=self.bid, cap=self.cap).all()

            for old in old_bursts_pairs:
                db.session.delete(old)

            for burst_pair in burst_pairs_df.itertuples():
                b = Burst_rel_allen.query.filter_by(
                    bid=self.bid,
                    cap=self.cap,
                    burst1=burst_pair.Bx_id,
                    burst2=burst_pair.By_id).first()

                if not b:
                    b = Burst_rel_allen(bid=self.bid,
                                        cap=self.cap,
                                        burst1=burst_pair.Bx_id,
                                        burst2=burst_pair.By_id,
                                        type=burst_pair.Rel)

                    db.session.add(b)
                else:
                    b.type = burst_pair.Rel

            # Single commit for everything written above, then report success.
            db.session.commit()
            self.updateStatus("modifiable")
        except ValueError as e:
            # Only ValueError is handled here; any other exception type
            # propagates without the status being changed.
            print("error:", sys.exc_info())
            self.updateStatus("failed")
            raise e
예제 #7
0
파일: Method_04.py 프로젝트: Teldh/PRET
def method_4(words, bid, cap):
    """Run method 4 on ``words`` and store m4 / m4a / m4b in Baseline_Methods.

    For every ordered pair ``(a, b)`` of distinct keys of ``page_words``:

    * when ``usage_definition(a, b, page_words)`` is truthy, the row with
      ``lemma1=b`` and ``lemma2=a`` gets ``m4 = 1``;
    * otherwise a learning-level difference and a content similarity are
      computed and kept under the key ``"a__b"``.

    After all pairs are processed, both score dictionaries are passed to
    ``normalize`` and then written by ``populateDb``. The status for
    ``(bid, cap)`` goes "running" -> "succeeded"; on any error the session is
    rolled back, the status becomes "failed", and the error is printed and
    re-raised.

    :param words: words/concepts handed to ``page_finder``
    :param bid: identifier used to select/create rows
    :param cap: identifier used to select/create rows
    :raises: whatever the helpers or the database raise (re-raised unchanged)
    """
    try:
        updateStatus(bid, cap, "running")

        missingRel = []
        page_words = {}
        lernDict = {}
        cosinDict = {}
        wiki_backlinks = {}

        # page_finder fills page_words and wiki_backlinks (both start empty).
        page_finder(words, page_words, wiki_backlinks)

        print("pagine trovate")
        result = topic_model(page_words)
        ldamodel = result[0]
        doc_term_matrix = result[1]

        for a in list(page_words.keys()):
            for b in [x for x in list(page_words.keys()) if x != a]:
                if usage_definition(a, b, page_words):
                    bs = Baseline_Methods.query.filter_by(bid=bid,
                                                          cap=cap,
                                                          lemma1=b,
                                                          lemma2=a).first()
                    if not bs:
                        bs = Baseline_Methods(bid=bid,
                                              cap=cap,
                                              lemma1=b,
                                              lemma2=a,
                                              m4=1)
                        db.session.add(bs)
                    else:
                        bs.m4 = 1
                else:
                    inLinksDiff = in_links(a, b, wiki_backlinks)
                    outLinksDiff = out_links(a, b, page_words)
                    topicCovDiff = entropy(a, b, ldamodel, page_words,
                                           doc_term_matrix)
                    contentSim = cosinesim(a, b, page_words)
                    # Avoid dividing by zero: without an out-link difference
                    # only the topic coverage term is used.
                    if outLinksDiff == 0:
                        learnLevelDiff = topicCovDiff
                    else:
                        learnLevelDiff = inLinksDiff / outLinksDiff + topicCovDiff

                    pair_key = a + "__" + b
                    lernDict[pair_key] = learnLevelDiff
                    cosinDict[pair_key] = contentSim
                    missingRel.append(pair_key)
                    # TODO (from the original notes): define fixed threshold1
                    # and threshold2 in the code and set m4 to 1 when both
                    # learnLevelDiff and contentSim exceed them.
            # Commit after each `a` so the m4 rows found so far are persisted.
            db.session.commit()

        # Return values are discarded, so normalize presumably works in place.
        normalize(cosinDict)
        normalize(lernDict)
        populateDb(missingRel, cosinDict, lernDict, bid, cap)
        # populateDb only adds/updates rows; commit them explicitly before
        # reporting success instead of relying on a later commit elsewhere.
        db.session.commit()

        updateStatus(bid, cap, "succeeded")
    except BaseException:
        # Same catch-all scope as before. Discard the uncommitted work first so
        # that the status update below starts from a clean session.
        db.session.rollback()
        updateStatus(bid, cap, "failed")
        print("error:", sys.exc_info())
        raise