def __init__(self, file, label_type, threshold):
    with open('public_data/vocab/word2id.pkl', 'rb') as infile:
        self.word2id = pickle.load(infile)
    with open('public_data/stats/stats_train.pkl', 'rb') as infile:
        stats = pickle.load(infile)
    # keep only labels that occur at least `threshold` times in the train set
    self.orig_labels = [label for label, freq in stats["DISTR_" + label_type].items()
                        if freq >= threshold]
    self.label_type = label_type
    with open("public_data/inputs/binary_%s_%s_%s_%s.pkl"
              % (file, label_type, str(threshold), "triplet"), 'rb') as indata:
        self.df = pd.read_pickle(indata, compression=None)[:TRAIN_LIMIT]
    # encode each text column as word ids (1 = unknown word) and pad to equal length
    self.sequences_A = [torch.LongTensor([self.word2id.get(word, 1)
                                          for word in tokenize_and_clean(text)])
                        for text in self.df["TEXT_A"]]
    self.sequences_A = pad_sequence(self.sequences_A, batch_first=True)
    self.sequences_B = [torch.LongTensor([self.word2id.get(word, 1)
                                          for word in tokenize_and_clean(text)])
                        for text in self.df["TEXT_B"]]
    self.sequences_B = pad_sequence(self.sequences_B, batch_first=True)
    self.sequences_C = [torch.LongTensor([self.word2id.get(word, 1)
                                          for word in tokenize_and_clean(text)])
                        for text in self.df["TEXT_C"]]
    self.sequences_C = pad_sequence(self.sequences_C, batch_first=True)
    self.targets = torch.FloatTensor(np.ones(len(self.df)))
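
# A minimal sketch of the matching accessors (assumption: this __init__ belongs
# to a torch Dataset used for triplet training; the real class may define these
# differently). Each item is one (anchor, positive, negative, target) row.
def __len__(self):
    return len(self.df)

def __getitem__(self, idx):
    return (self.sequences_A[idx], self.sequences_B[idx],
            self.sequences_C[idx], self.targets[idx])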
def compute_tfidf(collection, queries, idf_base):
    """
    Computes tf-idf vectors for collection and query documents.

    :param collection: all collection documents
    :param queries: all query documents
    :param idf_base: identifier used when storing the idf weights
                     ("above_threshold", "below_threshold" or "all")
    :return: L2-normalized document vectors for collection and queries
    """
    # texts are pre-tokenized, so tokenizer and preprocessor are identity functions
    vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun,
                                 preprocessor=dummy_fun, token_pattern=None)
    texts = list(collection["TEXT"].apply(lambda x: tokenize_and_clean(x)))
    collection = vectorizer.fit_transform(texts)
    collection = normalize(csr_matrix(collection, dtype=np.float32).toarray(), copy=True)
    texts = list(queries["TEXT"].apply(lambda x: tokenize_and_clean(x)))
    queries = vectorizer.transform(texts)
    queries = normalize(csr_matrix(queries, dtype=np.float32).toarray(), copy=True)
    # unseen words fall back to the maximum idf, i.e. are treated as maximally rare
    max_idf = max(vectorizer.idf_)
    word2weight = defaultdict(lambda: max_idf,
                              [(w, vectorizer.idf_[i])
                               for w, i in vectorizer.vocabulary_.items()])
    with open("public_data/vocab/tf_idf_word2weight_%s.pkl" % idf_base, 'wb') as out:
        pickle.dump(dict(word2weight), out, protocol=4)
    return collection, queries
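
# Usage sketch only (the toy dataframes are illustrative, not from the real
# pipeline, and the public_data/vocab directory is assumed to exist). Because
# compute_tfidf L2-normalizes both matrices, a plain dot product already
# yields cosine similarities, so ranking reduces to a single matmul.
def _example_rank_with_tfidf():
    corpus_df = pd.DataFrame({"TEXT": ["the cat sat", "dogs bark loudly"]})
    query_df = pd.DataFrame({"TEXT": ["cat sat"]})
    collection_vecs, query_vecs = compute_tfidf(collection=corpus_df,
                                                queries=query_df, idf_base="all")
    scores = query_vecs @ collection_vecs.T  # shape: (n_queries, n_docs)
    return np.argsort(-scores, axis=1)       # best-matching documents first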
def make_txt_file(file):
    """
    Turns documents from a dataframe into a txt file used as input
    to unsupervised embedding training.

    :param file: pickled dataframe
    """
    with open(file, 'rb') as infile:
        df = pd.read_pickle(infile, compression=None)
    texts = df["TEXT"].apply(lambda x: tokenize_and_clean(x))
    with open("public_data/inputs/data.txt", "w", encoding="utf-8") as out:
        for text in texts:
            out.write(" ".join(text) + "\n")
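
# Sketch of how the txt file produced above could feed unsupervised embedding
# training. gensim's Word2Vec and the output path are assumptions made for
# illustration; the project may train its embeddings differently.
def _example_train_embeddings():
    from gensim.models import Word2Vec
    model = Word2Vec(corpus_file="public_data/inputs/data.txt",
                     vector_size=300, window=5, min_count=5, workers=4)
    model.wv.save_word2vec_format("public_data/vocab/embeddings.txt")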
def get_tokens_vocab(self):
    print(str(datetime.datetime.now()).split('.')[0],
          "Extracting tokens and vocab...")
    all_tokens = []
    seq_lens = []
    tokens = self.dataset["TEXT"].apply(lambda x: tokenize_and_clean(x))
    for token_list in tokens:
        all_tokens.extend(token_list)
        seq_lens.append(len(token_list))
    self.stats["TOKENS"] = all_tokens
    self.stats["VOCAB"] = list(set(all_tokens))
    self.stats["TOKEN_FREQS"] = Counter(all_tokens)
    self.stats["AVG_TEXT_LEN"] = sum(seq_lens) / len(seq_lens)
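
# Sketch (hypothetical helper, not part of the original code): derive the
# word2id mapping consumed by the dataset classes from the token frequencies
# collected above. Ids 0 and 1 are reserved for padding and unknown words,
# matching pad_sequence's zero padding and the `word2id.get(word, 1)` fallback.
def _example_build_word2id(stats, min_freq=5):
    word2id = {"<pad>": 0, "<unk>": 1}
    for word, freq in stats["TOKEN_FREQS"].most_common():
        if freq >= min_freq:
            word2id[word] = len(word2id)
    return word2id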
def __init__(self, file, label_type, threshold, which_labels):
    self.label_type = label_type
    with open('public_data/vocab/word2id.pkl', 'rb') as infile:
        self.word2id = pickle.load(infile)
    with open('public_data/inputs/%s.pkl' % file, 'rb') as infile:
        data = pd.read_pickle(infile, compression=None)
    if file == "train":
        self.df = data[:TRAIN_LIMIT]
    elif file in ["valid", "test"]:
        self.df = data[:VALID_LIMIT]
    with open('public_data/stats/stats_train.pkl', 'rb') as infile:
        stats = pickle.load(infile)
    if which_labels in ["above_threshold", "below_threshold"]:
        # restrict to labels whose train-set frequency lies above/below the threshold
        if which_labels == "above_threshold":
            labels = [label for label, freq in stats["DISTR_" + label_type].items()
                      if freq >= threshold]
        elif which_labels == "below_threshold":
            labels = [label for label, freq in stats["DISTR_" + label_type].items()
                      if freq < threshold]
        self.df = self.df[self.df[label_type].isin(labels)]
        if file == "train":
            self.df = self.df[:CORPUS_LIMIT]
        elif file in ["valid", "test"]:
            self.df = self.df[:QUERY_LIMIT]
    self.sequences = [torch.LongTensor([self.word2id.get(word, 1)
                                        for word in tokenize_and_clean(text)])
                      for text in self.df["TEXT"]]
    self.sequences = pad_sequence(self.sequences, batch_first=True)
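
# Usage sketch (`RetrievalDataset` and the label type are hypothetical names
# standing in for the class this __init__ belongs to and a real label column):
# build the collection from frequent labels and the queries from rare ones,
# as in the threshold-split retrieval experiments.
def _example_load_retrieval_split(label_type, threshold):
    collection = RetrievalDataset(file="train", label_type=label_type,
                                  threshold=threshold, which_labels="above_threshold")
    queries = RetrievalDataset(file="test", label_type=label_type,
                               threshold=threshold, which_labels="below_threshold")
    return collection, queries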
def __init__(self, file, label_type, threshold):
    with open('public_data/vocab/word2id.pkl', 'rb') as infile:
        self.word2id = pickle.load(infile)
    with open('public_data/stats/stats_train.pkl', 'rb') as infile:
        stats = pickle.load(infile)
    d = stats["DISTR_" + label_type]
    # labels sorted by descending frequency, so id 0 is the most frequent label
    labels = [label for label, freq in sorted(d.items(), key=lambda item: item[1],
                                              reverse=True)
              if freq >= threshold]
    self.label2id = {l: i for i, l in enumerate(labels)}
    with open('public_data/inputs/%s.pkl' % file, 'rb') as indata:
        if file == "train":
            data = pd.read_pickle(indata, compression=None)[:TRAIN_LIMIT]
        elif file in ["valid", "test"]:
            data = pd.read_pickle(indata, compression=None)[:VALID_LIMIT]
    self.df = data[data[label_type].isin(labels)]
    self.sequences = [torch.LongTensor([self.word2id.get(word, 1)
                                        for word in tokenize_and_clean(text)])
                      for text in self.df["TEXT"]]
    self.sequences = pad_sequence(self.sequences, batch_first=True)
    self.labels = torch.LongTensor([self.label2id[label]
                                    for label in self.df[label_type]])
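
# Usage sketch (`ClassificationDataset` is an assumed name for the class this
# __init__ belongs to, and the (sequences, labels) item layout is an assumption
# about its __getitem__):
def _example_classification_loader(label_type, threshold):
    dataset = ClassificationDataset(file="train", label_type=label_type,
                                    threshold=threshold)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    for sequences, labels in loader:
        # sequences: (batch, max_len) LongTensor of word ids; labels: (batch,)
        pass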
def get_collection_and_queries(self):
    if self.experiment == "TF-IDF":
        collection, queries = compute_tfidf(collection=self.collection.df,
                                            queries=self.queries.df,
                                            idf_base=self.which_labels)
    elif self.experiment == "BM25":
        queries = list(self.queries.df["TEXT"].apply(lambda x: tokenize_and_clean(x)))
        collection = list(self.collection.df["TEXT"].apply(lambda x: tokenize_and_clean(x)))
    elif self.experiment == "RANDOM_DOC":
        queries = normalize(np.random.uniform(low=-0.1, high=0.1,
                                              size=(len(self.queries), 300)).astype("float32"),
                            axis=1)
        collection = normalize(np.random.uniform(low=-0.1, high=0.1,
                                                 size=(len(self.collection), 300)).astype("float32"),
                               axis=1)
    else:
        collection_loader = DataLoader(self.collection, batch_size=64, shuffle=False)
        queries_loader = DataLoader(self.queries, batch_size=64, shuffle=False)
        # without additional classifier
        if ("mucl" not in self.experiment and "pair" not in self.experiment
                and "triplet" not in self.experiment):
            queries = combine_avg_doc_embs(sequences=queries_loader,
                                           model=self.experiment, idf_base="all")
            collection = combine_avg_doc_embs(sequences=collection_loader,
                                              model=self.experiment, idf_base="all")
        else:
            # with classifier
            if self.doc_repr == "avg":
                queries = combine_avg_doc_embs(sequences=queries_loader,
                                               model=self.model,
                                               idf_base=self.which_labels)
                collection = combine_avg_doc_embs(sequences=collection_loader,
                                                  model=self.model,
                                                  idf_base=self.which_labels)
            elif self.doc_repr in ["hidden", "linear", "softmax"]:
                collection = extract_from_model(doc_repr=self.doc_repr,
                                                sequences=collection_loader,
                                                model=self.model)
                queries = extract_from_model(doc_repr=self.doc_repr,
                                             sequences=queries_loader,
                                             model=self.model)
    return collection, queries
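
# Sketch (hypothetical helper): rank collection documents for each query using
# the representations returned above. This applies to the vector-producing
# branches only; the BM25 branch returns token lists and needs its own scorer.
# Since those vectors are L2-normalized, cosine ranking is one matrix product.
def _example_retrieve(self, k=10):
    collection, queries = self.get_collection_and_queries()
    scores = np.asarray(queries) @ np.asarray(collection).T
    return np.argsort(-scores, axis=1)[:, :k]  # top-k doc indices per query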