Example 1
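Trains a StarSpace model (trainMode 0) on pre-tokenised sentences through temporary files and returns a dict mapping each token to its NumPy embedding vector. The method presumably lives on a trainer class, given the self.learning_rate, self.num_epochs and self.embedding_size attributes.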
    def _get_vectors_container(self, processed_sentences):
        # Requires: import os, numpy as np, starwrap as sw
        # Write one "<tokens> __label__<index>" line per sentence;
        # StarSpace reads its training data from a file.
        concatenated = [
            " ".join(s) + f" __label__{i}\n"
            for i, s in enumerate(processed_sentences)
        ]

        with open("tmp_sentences.txt", "w") as f_out:
            f_out.writelines(concatenated)

        # Train a trainMode=0 (labelled-example) StarSpace model and dump
        # the learned embeddings to a TSV file.
        arg = sw.args()
        arg.trainFile = "tmp_sentences.txt"
        arg.trainMode = 0
        arg.lr = self.learning_rate
        arg.epoch = self.num_epochs
        arg.dim = self.embedding_size
        arg.similarity = "dot"
        sp = sw.starSpace(arg)
        sp.init()
        sp.train()
        sp.saveModelTsv("tmp_embeds.tsv")

        # Read the TSV back, skipping label rows; each remaining row is
        # "<token> <v1> <v2> ...".
        vectors_container = {}
        with open("tmp_embeds.tsv", "r") as f_in:
            for line in f_in:
                if "__label__" not in line:
                    split_line = line.split()
                    key = split_line[0]
                    embedding = np.array(
                        [float(num) for num in split_line[1:]])
                    vectors_container[key] = embedding

        # Clean up the temporary files.
        os.remove("tmp_sentences.txt")
        os.remove("tmp_embeds.tsv")

        return vectors_container
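
A hypothetical invocation; `trainer` and the sample sentences are illustrative, not from the original project:

# Hypothetical usage; `trainer` is an instance of the enclosing class
# with learning_rate, num_epochs and embedding_size set.
processed = [["the", "cat", "sat"], ["dogs", "bark", "loudly"]]
vectors = trainer._get_vectors_container(processed)
print(vectors["cat"])  # NumPy array of length trainer.embedding_size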
Example 2
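Ranks a set of documents against a query q: each file is split into sentences, every sentence is embedded with getDocVector, and a document's score is the mean cosine similarity between the query vector and its sentence vectors.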
import starwrap as sw
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def case_ranker(q, file_lists):
    arg = sw.args()
    arg.trainFile = './input.txt'
    arg.testFile = './input.txt'
    arg.trainMode = 5
    test_dir = 'query4/data/ranker/All_FT/'

    # Initialise the StarSpace model from the args above.
    sp = sw.starSpace(arg)
    sp.init()

    MIN_SENTENCE_LEN = 10

    def get_sentences(fp):
        # Naive sentence split on '.'; keep only sentences of at least
        # MIN_SENTENCE_LEN characters.
        doc = fp.read()
        return [s for s in doc.split('.') if len(s) >= MIN_SENTENCE_LEN]

    # getDocVector returns a 1 x dim matrix, the shape cosine_similarity
    # expects.
    qvec = np.array(sp.getDocVector(q, ' '))
    doc_score = {}

    for file in file_lists:
        with open(test_dir + file) as fp:
            sentences = get_sentences(fp)

        if not sentences:
            doc_score[file] = 0
            continue

        # Score a document by the mean cosine similarity between the
        # query vector and each of its sentence vectors.
        sc = 0
        for s in sentences:
            vec = np.array(sp.getDocVector(s, ' '))
            sc += cosine_similarity(qvec, vec)[0][0]

        doc_score[file] = sc / len(sentences)
    return doc_score
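
A hypothetical call; the query and file names are illustrative and assume './input.txt' and the files under query4/data/ranker/All_FT/ exist:

scores = case_ranker("neural ranking models", ["doc_a.txt", "doc_b.txt"])
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
print(ranked)  # [(filename, mean cosine similarity), ...], best first
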
Example 3

This snippet loads pretrained StarSpace embeddings from a TSV file and defines a Toktok-based tokenize helper that detaches sentence-internal periods.

import starwrap as sw
import numpy as np
import pandas as pd
import re
from nltk.tokenize.toktok import ToktokTokenizer

arg = sw.args()
arg.trainMode = 0

sp = sw.starSpace(arg)

# Load pretrained embeddings from TSV; initFromSavedModel would load a
# saved binary model instead.
# sp.initFromSavedModel('scientificn1')
sp.initFromTsv('sci-mc5-tm5.tsv')


def tokenize(text, tt):
    """Tokenize with Toktok, then normalise sentence-internal periods."""
    text = " ".join(tt.tokenize(text))

    # Adapted from toktok: detach a sentence-final period (but not the
    # last dot of an ellipsis), also before closing quotes, and collapse
    # runs of spaces.
    PERIOD_AND_SPACE_1 = re.compile(r"(?<!\.)\.\s"), r" . "
    PERIOD_AND_SPACE_2 = re.compile(
        r"""(?<!\.)\.\s*(["'’»›”]) *\s"""), r" . \1"
    ONE_SPACE = re.compile(r" {2,}"), " "
    RXS = [PERIOD_AND_SPACE_1, PERIOD_AND_SPACE_2, ONE_SPACE]

    for regexp, substitution in RXS:
        text = regexp.sub(substitution, text)

    return text.strip().lower()
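
A minimal usage sketch; the sample sentence is illustrative:

tt = ToktokTokenizer()
print(tokenize("StarSpace embeds sentences. It works well.", tt))
# Expected: 'starspace embeds sentences . it works well .'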