def _get_vectors_container(self, processed_sentences):
    """Train a StarSpace model on the tokenised sentences and return a
    token -> embedding mapping.

    Parameters
    ----------
    processed_sentences : list[list[str]]
        Each inner list is one sentence's tokens.

    Returns
    -------
    dict[str, np.ndarray]
        Maps each token to its embedding vector
        (length ``self.embedding_size``).
    """
    # One training line per sentence, tagged with a unique label so
    # StarSpace trainMode 0 can learn token embeddings.
    concatenated = [
        " ".join(s) + f" __label__{i}\n"
        for i, s in enumerate(processed_sentences)
    ]
    try:
        with open("tmp_sentences.txt", "w") as f_out:
            f_out.writelines(concatenated)

        arg = sw.args()
        arg.trainFile = "tmp_sentences.txt"
        arg.trainMode = 0
        arg.lr = self.learning_rate
        arg.epoch = self.num_epochs
        arg.dim = self.embedding_size
        arg.similarity = "dot"

        sp = sw.starSpace(arg)
        sp.init()
        sp.train()
        sp.saveModelTsv("tmp_embeds.tsv")

        vectors_container = {}
        with open("tmp_embeds.tsv", "r") as f_in:
            for line in f_in:
                # Skip the synthetic __label__ rows; keep real tokens only.
                if "__label__" not in line:
                    split_line = line.split()
                    key = split_line[0]
                    embedding = np.array(
                        [float(num) for num in split_line[1:]])
                    vectors_container[key] = embedding
        return vectors_container
    finally:
        # BUGFIX: previously the temp files leaked whenever training or
        # TSV parsing raised; always clean up, tolerating a file that
        # was never created.
        for tmp_name in ("tmp_sentences.txt", "tmp_embeds.tsv"):
            try:
                os.remove(tmp_name)
            except OSError:
                pass
def case_ranker(q, file_lists):
    """Score each document by the mean cosine similarity between the
    query embedding and the embeddings of the document's sentences.

    Parameters
    ----------
    q : str
        Query text (space-separated tokens, fed to StarSpace as-is).
    file_lists : iterable[str]
        File names located under ``query4/data/ranker/All_FT/``.

    Returns
    -------
    dict[str, float]
        File name -> mean sentence similarity; 0 when the document has
        no usable sentences.
    """
    arg = sw.args()
    arg.trainFile = './input.txt'
    arg.testFile = './input.txt'
    arg.trainMode = 5
    test_dir = 'query4/data/ranker/All_FT/'
    sp = sw.starSpace(arg)
    sp.init()

    MIN_SENTENCE_LEN = 10

    def get_sentences(fp):
        # Naive split on '.'; drop fragments shorter than
        # MIN_SENTENCE_LEN characters.
        return [s for s in fp.read().split('.') if len(s) >= MIN_SENTENCE_LEN]

    qvec = np.array(sp.getDocVector(q, ' '))
    doc_score = {}
    for file in file_lists:
        # BUGFIX: the original opened the file without ever closing it,
        # leaking one handle per document; use a context manager.
        with open(test_dir + file) as fp:
            sentences = get_sentences(fp)
        if not sentences:
            doc_score[file] = 0
            continue
        sc = 0
        for s in sentences:
            vec = np.array(sp.getDocVector(s, ' '))
            sc += cosine_similarity(qvec, vec)[0][0]
        doc_score[file] = sc / len(sentences)
    return doc_score
import starwrap as sw
import numpy as np
import pandas as pd
import re
from nltk.tokenize.toktok import ToktokTokenizer

arg = sw.args()
arg.trainMode = 0
sp = sw.starSpace(arg)
#sp.initFromSavedModel('scientificn1')
sp.initFromTsv('sci-mc5-tm5.tsv')

# Regex/replacement pairs adapted from toktok. PERF: compiled once at
# import time — the original rebuilt (and re-looked-up) them on every
# tokenize() call.
_PERIOD_AND_SPACE_1 = re.compile(r"(?<!\.)\.\s"), r" . "
_PERIOD_AND_SPACE_2 = re.compile(
    r"""(?<!\.)\.\s*(["'’»›”]) *\s"""), r" . \1"
_ONE_SPACE = re.compile(r" {2,}"), " "
_RXS = [_PERIOD_AND_SPACE_1, _PERIOD_AND_SPACE_2, _ONE_SPACE]


def tokenize(text, tt):
    """Tokenize *text* with the ToktokTokenizer *tt*, normalise
    sentence-final periods, collapse runs of spaces, and return the
    stripped, lower-cased result.

    Parameters
    ----------
    text : str
        Raw input text.
    tt : ToktokTokenizer
        Tokenizer instance reused across calls.

    Returns
    -------
    str
        Lower-cased, space-normalised token string.
    """
    text = " ".join(tt.tokenize(text))
    for regexp, substitution in _RXS:
        text = regexp.sub(substitution, text)
    return text.strip().lower()