def splitDatapoints(self):
    cached_mappings = load_obj(self.MAP_FILE)
    datapoints = cached_mappings['file']
    indexes = cached_mappings['index']
    allIndexes = list(range(len(datapoints)))
    np.random.shuffle(allIndexes)
    training_offset = int(len(allIndexes) * 0.8)
    validation_offset = int(len(allIndexes) * 0.9)
    training_indexes = allIndexes[:training_offset]
    validation_indexes = allIndexes[training_offset:validation_offset]
    testing_indexes = allIndexes[validation_offset:]
    save_obj(
        {
            'train': self.getValues(datapoints, indexes, training_indexes),
            'val': self.getValues(datapoints, indexes, validation_indexes),
            'test': self.getValues(datapoints, indexes, testing_indexes)
        }, self.SPLIT_FILE)
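# Nearly every snippet in this section depends on save_obj/load_obj helpers
# that are not shown. A minimal sketch of what such helpers typically look
# like, assuming a pickle-based implementation with a '.pkl' suffix (both
# assumptions; note also that the argument order varies across the snippets
# below -- some pass the object first, some the filename first -- so each
# project evidently defines its own variant):
import pickle

def save_obj(obj, name):
    # serialize obj to '<name>.pkl' using the highest pickle protocol
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    # deserialize and return the object stored in '<name>.pkl'
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)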
def compute_relevance_matrices(documents, annotations, thesaurus, vocab):
    for th in thesaurus:
        with open('last_th.txt', 'w') as f:
            f.write(th)
        # helps separate relevant docs from non-relevant ones
        corpus = defaultdict(list)
        # mark relevance for all documents
        for doc, tags in zip(documents, annotations):
            if th in tags:
                corpus['relevant'].append(doc)
            else:
                corpus['nonrelevant'].append(doc)
        # word occurrences in relevant and non-relevant documents
        rel_count_vec = word_occurence(corpus['relevant'], vocab)
        non_count_vec = word_occurence(corpus['nonrelevant'], vocab)
        # number of relevant and non-relevant documents
        N_rel = len(corpus['relevant'])
        N_non = len(corpus['nonrelevant'])
        # compute maximum likelihood estimates
        p_prob = compute_mle_vector(rel_count_vec, N_rel)
        q_prob = compute_mle_vector(non_count_vec, N_non)
        # save probabilities to disk
        save_obj(obj=(p_prob, q_prob), name=th)
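# word_occurence and compute_mle_vector are not shown above. A minimal
# sketch, assuming word_occurence counts, for each vocabulary word, the
# number of documents it appears in, and compute_mle_vector turns those
# counts into maximum-likelihood probabilities (both signatures are
# assumptions, not confirmed by the source):
import numpy as np

def word_occurence(docs, vocab):
    # docs: iterable of token lists; vocab: ordered list of words
    counts = np.zeros(len(vocab))
    for doc in docs:
        tokens = set(doc)
        for j, word in enumerate(vocab):
            if word in tokens:
                counts[j] += 1
    return counts

def compute_mle_vector(count_vec, n_docs):
    # P(word | class) estimated as document frequency / number of documents
    return count_vec / n_docs if n_docs > 0 else np.zeros_like(count_vec)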
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)

    # Iterate over every document in the file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of documents in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)

    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def Q1_1(category):
    with open(fileLocation(category), encoding="utf8") as f:
        tweets = f.readlines()

    firstTs = FIRST_TS[category]
    firstTs = firstTs // 3600 * 3600  # round down to the start of the hour
    lastTs = LAST_TS[category]
    totalHours = tsDiffHour(firstTs, lastTs) + 1

    hourCount = [0] * totalHours
    followerCount = 0
    retweetCount = 0

    for tweet in tweets:
        t = json.loads(tweet)
        ts = t['citation_date']
        # count tweets per hour
        hourDiff = tsDiffHour(firstTs, ts)
        hourCount[hourDiff] += 1
        # count followers
        followerCount += t['author']['followers']
        # count retweets
        retweetCount += t['metrics']['citations']['total']

    save_obj(category + '_numTweetsInHour', hourCount)

    # report the averages
    print(category + ': ' + 'Average number of tweets per hour: ' +
          str(np.mean(hourCount)))
    print(category + ': ' + 'Average number of followers of users posting the tweets: ' +
          str(followerCount / len(tweets)))
    print(category + ': ' + 'Average number of retweets: ' +
          str(retweetCount / len(tweets)))
def build_index_from_parquet(self, fn):
    """
    Reads parquet file and passes it to the parser, then indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    df = pd.read_parquet(fn, engine="pyarrow")
    documents_list = df.values.tolist()
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document)

    to_del = []

    def remove_word_1():
        # note: this helper is defined but never invoked here
        for key in self._indexer.inverted_idx:
            if self._indexer.inverted_idx[key] == 1 and not key.isalpha():
                to_del.append(key)
                self._indexer.postingDict.pop(key)
        for key in to_del:
            self._indexer.inverted_idx.pop(key)

    to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
               self._indexer.num_of_docs, self._indexer.avg_Size_doc)
    utils.save_obj(to_Save, "index_3")
def build_index_from_parquet(self, fn):
    """
    Reads parquet file and passes it to the parser, then indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    df = pd.read_parquet(fn, engine="pyarrow")
    documents_list = df.values.tolist()
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document)
    self._indexer.add_square_Wij()
    to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
               self._indexer.num_of_docs, self._indexer.avg_Size_doc,
               self._indexer.doc_info)
    utils.save_obj(to_Save, "index_4")
    print('Finished parsing and indexing.')
def remove_uppercase_and_entities(self, indexer):
    word_in_lower_and_upper = []
    inverted_idx = indexer.inverted_idx
    # Check whether a word that was found in upper case also appears in lower
    # case. If so, merge it into the lower-case entry and remove the
    # upper-case one from the posting files (and the inverted index).
    for letter in self.uppercase_dict:
        upper_to_lower_words = [
            x.lower() for x in list(self.uppercase_dict[letter])
        ]
        for word in upper_to_lower_words:
            if word in inverted_idx:
                word_in_lower_and_upper.append(word)
        letter_posting_file = utils.load_obj(indexer.out + letter.lower())
        for word in word_in_lower_and_upper:
            if word in letter_posting_file and word.upper() in letter_posting_file:
                # TODO why do we need to check this - debug
                word_appearance = letter_posting_file[word.upper()]
                letter_posting_file[word].extend(word_appearance)
                del letter_posting_file[word.upper()]
                del inverted_idx[word.upper()]
        # Entities - keep only those that appear at least twice; otherwise
        # remove from the posting files (and the inverted index).
        for entity in self.entities_dict[letter]:
            if entity in letter_posting_file and len(letter_posting_file[entity]) < 2:
                del letter_posting_file[entity]
                del inverted_idx[entity]
        utils.save_obj(letter_posting_file, indexer.out + letter)
def save(self, filename):
    """
    Dump current mesh into an *.obj file
    :param filename: Filename
    """
    utils.save_obj(self.vertex, self.tri, filename)
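# Unlike the pickle-style save_obj used elsewhere in this section, the mesh
# variant above writes Wavefront *.obj geometry. A minimal sketch, assuming
# vertices is an (N, 3) array and faces an (M, 3) array of 0-based indices
# (the real utils.save_obj may differ):
def save_obj_mesh(vertices, faces, filename):
    with open(filename, 'w') as f:
        for v in vertices:
            # one 'v x y z' line per vertex
            f.write('v %f %f %f\n' % (v[0], v[1], v[2]))
        for tri in faces:
            # OBJ face indices are 1-based
            f.write('f %d %d %d\n' % (tri[0] + 1, tri[1] + 1, tri[2] + 1))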
def build_index_from_parquet(self, fn):
    """
    Reads parquet file and passes it to the parser, then indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    df = pd.read_parquet(fn, engine="pyarrow")
    documents_list = df.values.tolist()
    # Iterate over every document in the file
    number_of_documents = 0
    doc_len = len(documents_list)
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document, doc_len)
    # print('Finished parsing and indexing.')
    # print('Finished merge, start rebuilding posting dict')
    # self._indexer.rebuild_postingDict()
    self._indexer.rebuild_inverted_index()
    # print('finished rebuilding inverted index')
    to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
               self._indexer.reversed_inverted_index)
    utils.save_obj(to_save, 'idx_bench')
    # TODO: set inverted_idx, tweet_dict, reversed_inverted_index, to_save to None
    self._indexer.inverted_idx = None
    self._indexer.tweet_dict = None
    self._indexer.reversed_inverted_index = None
    to_save = None
def run_engine(corpus_path='', output_path='', stemming=False):
    """
    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)

    # Get all parquet files from corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents in a pool of worker processes
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()
        p.entities.clear()

    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
def train(df, attrs, clf_class, clf_name, model_params, mode, magic_number,
          dates, dataset_name, trading_params):
    trade_freq = trading_params['trade_frequency']
    name = '%s-%s-attr%s-%s-%s-%s-%s-%s_' % (
        clf_name, dataset_name, len(attrs),
        dict_to_str(model_params).replace(' ', '_').replace(':', ''),
        mode, magic_number,
        pd.to_datetime(dates[0], format=DATE_FORMAT).date(),
        pd.to_datetime(dates[1], format=DATE_FORMAT).date())
    cached_file = os.path.join(CACHE_PATH + '/models/', name)

    start_date, final_date = dates
    idx = 0
    indices = sorted([
        day for day in list(set(df.index.values))
        if start_date <= day <= final_date
    ])

    print("Model and params: %s %s " % (clf_name, model_params))

    # magic_number is 53 by default: 52 weeks for training, 1 for prediction
    while idx + magic_number < len(indices) and \
            indices[idx + magic_number] <= indices[-1]:
        if mode == CLASSIFICATION:
            train_x, train_y, test_x, test_y = \
                get_classification_data(clf_name, df, attrs, indices, idx,
                                        magic_number)
        elif mode == REGRESSION:
            # get regression datasets (target is a float y: the ratio of increase)
            train_x, train_y, test_x, test_y = \
                get_regression_data(clf_name, df, attrs, indices, idx,
                                    magic_number)

        print("Training %s/%s with %s instances." %
              (idx // trade_freq, len(indices) // trade_freq, train_x.shape[0]))
        sys.stdout.flush()

        clf_cached_file = cached_file + str(indices[idx])[:10]
        if not CHECKPOINTING:
            clf = clf_class(**model_params).fit(train_x, train_y)
        else:
            # load the cached model for this window; train and cache it on a miss
            try:
                clf = load_obj(clf_cached_file)
            except Exception:
                clf = clf_class(**model_params).fit(train_x, train_y)
                save_obj(clf, clf_cached_file)

        pred = clf.predict(test_x)
        df.loc[indices[idx + magic_number], clf_name] = pred
        idx += trade_freq

    df_trade = df.dropna(axis=0)
    print("Finished training for %s" % clf_name)
    return df_trade
def create_database(self):
    all_infos = self.read_info_files()
    if self.shuffle:
        from random import shuffle
        shuffle(all_infos)
    else:
        all_infos.sort(key=lambda tup: int(tup[0]))
    age_infos, sex_infos = self.edit_all_infos(all_infos)
    self.make_folders()

    # Copy age-class images
    train_age, test_age = self.split_infos(age_infos, age=True)
    age_train_occurred_labels = self.copy_all2(train_age, self.db_age_train_folder_path)
    self.copy_all2(test_age, self.db_age_test_folder_path)
    age_class_weight = self.calculate_class_weights(self.age_labels,
                                                    age_train_occurred_labels)
    utils.save_obj(age_class_weight, self.db_age_folder_path)
    print("AGE CLASS WEIGHT")
    print(age_class_weight)

    # Copy sex-class images
    train_sex, test_sex = self.split_infos(sex_infos, age=False)
    sex_train_occurred_labels = self.copy_all2(train_sex, self.db_sex_train_folder_path)
    self.copy_all2(test_sex, self.db_sex_test_folder_path)
    sex_class_weight = self.calculate_class_weights(self.sex_labels,
                                                    sex_train_occurred_labels)
    utils.save_obj(sex_class_weight, self.db_sex_folder_path)
    print("SEX CLASS WEIGHT")
    print(sex_class_weight)
def build_index_from_parquet(self, fn):
    """
    Reads parquet file and passes it to the parser, then indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    time1 = time.time()
    df = pd.read_parquet(fn, engine="pyarrow")
    documents_list = df.values.tolist()
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document)

    to_del = []
    # saving the objects needed for searching
    to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
               self._indexer.num_of_docs, self._indexer.avg_Size_doc)
    utils.save_obj(to_Save, "index_best")

    def remove_word_1():
        # note: this helper is defined but never invoked here
        for key in self._indexer.inverted_idx:
            if self._indexer.inverted_idx[key] == 1:
                to_del.append(key)
                self._indexer.postingDict.pop(key)
        for key in to_del:
            self._indexer.inverted_idx.pop(key)  # DO NOT MODIFY THIS SIGNATURE
def build_index_from_parquet(self, fn):
    """
    Reads parquet file and passes it to the parser, then indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    # r = ReadFile(ConfigClass.corpusPath)
    # documents_list = r.readAllCorpus()  # change if we need to read more than 1 parquet
    df = pd.read_parquet(fn, engine="pyarrow")
    documents_list = df.values.tolist()
    # needed to pass boris tests; sometimes inverted_idx fails to save in the testing system
    utils.save_obj({}, "inverted_idx")
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # sometimes we get an empty tweet; no need to index those
        if parsed_document.doc_length != 0:
            # index the document data
            self._indexer.add_new_doc(parsed_document)
    # Insert entities into the indexer and posting files
    self._indexer.addEntities(self._parser.suspectedEntityDict)
    # Sort the posting files
    self._indexer.update_idfWij(idx)
    self._indexer.save_index("inverted_idx")
    print('Finished parsing and indexing.')
def initialize_buckets(self, num_of_buckets, first_bucket_index=0):
    """
    Creates the bucket files on the disk and saves a mapping for their indices
    """
    for i in range(num_of_buckets):
        utils.save_obj([], "bucket" + str(first_bucket_index + i))
        self.buckets_mapping[i] = first_bucket_index + i
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    utils.save_obj(self.inverted_idx, fn)
def save_doc(self):
    if len(self.docs_posting) > 0:
        # self.docs_inverted[self.docs_counter] = self.docs_list_for_inverted
        utils.save_obj(self.docs_posting,
                       self.config.get_savedFileMainFolder() + '\\doc' +
                       str(self.docs_counter))
        self.num_of_docs_in_posting = 0
        # self.docs_list_for_inverted = []
        self.docs_counter += 1
        self.docs_posting = {}
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    utils.save_obj((self.inverted_idx, self.postingDict, self.docs_dict,
                    self.pop_dict), fn)
def save_in_merge(self, merged_posting, merged_list):
    utils.save_obj(
        merged_posting,
        self.config.get_savedFileMainFolder() + "\\" +
        str(self.posting_files_counter))
    merged_list.append(self.posting_files_counter)
    self.posting_files_counter += 1
    return {}
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    avg_doc_len = Indexer.avg_doc_len
    utils.save_obj([self.inverted_idx, self.postingDict, self.doc_file,
                    avg_doc_len], fn)
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    index_tup = (self.inverted_idx_doc, self.inverted_idx_term)
    utils.save_obj(index_tup, fn)
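# The save_index variants above pickle a tuple (or list) of index structures,
# but the matching loader is not shown. A minimal sketch for the four-element
# (inverted_idx, postingDict, docs_dict, pop_dict) layout, assuming
# utils.load_obj is the pickle counterpart of utils.save_obj:
def load_index(self, fn):
    # unpack in the same order the tuple was saved
    self.inverted_idx, self.postingDict, self.docs_dict, self.pop_dict = \
        utils.load_obj(fn)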
def create_search_dict(vocabulary_dict, embedding_dict=None):
    if not embedding_dict:
        embedding_dict = utils.load_obj("embedding_dict")
    new_embedding_dict = {}
    for word in embedding_dict:
        if word in vocabulary_dict:
            new_embedding_dict[word] = embedding_dict[word]
    utils.save_obj(new_embedding_dict, "new_embedding_dict")
def clean_memory(self):
    # Save posting file as pickle and clear all buckets in RAM
    for bucket_id, bucket_dict in self.buckets.items():
        save_obj(bucket_dict,
                 self.POSTING_PATH + '/' + bucket_id + "_" + str(self.current_dump))
        self.buckets[bucket_id] = {}
    self.current_dump += 1
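# clean_memory above writes one file per (bucket_id, dump) pair. A sketch of
# how those partial dumps might be merged back into a single posting dict,
# assuming each dump maps terms to lists of postings (an assumption; the
# actual merge strategy is not shown in the source):
def merge_bucket_dumps(self, bucket_id):
    merged = {}
    for dump in range(self.current_dump):
        partial = load_obj(self.POSTING_PATH + '/' + bucket_id + '_' + str(dump))
        for term, postings in partial.items():
            # concatenate postings for terms that appear in several dumps
            merged.setdefault(term, []).extend(postings)
    return merged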
def measure(self, generated, vessels, masks, num_data, iter_time, phase, total_time):
    # masking
    vessels_in_mask, generated_in_mask = utils.pixel_values_in_mask(
        vessels, generated, masks)

    # average processing time per image, in milliseconds
    avg_pt = (total_time / num_data) * 1000

    # evaluate Area Under the Curve of ROC and Precision-Recall
    auc_roc = utils.AUC_ROC(vessels_in_mask, generated_in_mask)
    auc_pr = utils.AUC_PR(vessels_in_mask, generated_in_mask)

    # binarize to calculate Dice coefficient
    binarys_in_mask = utils.threshold_by_otsu(generated, masks)
    dice_coeff = utils.dice_coefficient_in_train(vessels_in_mask, binarys_in_mask)
    acc, sensitivity, specificity = utils.misc_measures(
        vessels_in_mask, binarys_in_mask)
    score = auc_pr + auc_roc + dice_coeff + acc + sensitivity + specificity

    # auc_sum is used to select the best model during training
    # (earlier variants used auc_roc + auc_pr)
    auc_sum = dice_coeff + acc + auc_pr

    # print information
    ord_output = collections.OrderedDict([
        ('auc_pr', auc_pr), ('auc_roc', auc_roc), ('dice_coeff', dice_coeff),
        ('acc', acc), ('sensitivity', sensitivity), ('specificity', specificity),
        ('score', score), ('auc_sum', auc_sum),
        ('best_auc_sum', self.best_auc_sum), ('avg_pt', avg_pt)])
    utils.print_metrics(iter_time, ord_output)

    # write to tensorboard in train mode only
    if phase == 'train':
        self.model.measure_assign(auc_pr, auc_roc, dice_coeff, acc,
                                  sensitivity, specificity, score, iter_time)
    elif phase == 'test':
        # write in npy format for evaluation
        utils.save_obj(vessels_in_mask, generated_in_mask,
                       os.path.join(self.auc_out_dir, "auc_roc.npy"),
                       os.path.join(self.auc_out_dir, "auc_pr.npy"))

    return auc_sum
def merge_index(config, files_num):
    """
    Loads all the temporary index files that were made by the parse_and_index
    function and merges them into one unified index. The function applies the
    capital-letters rule: if every occurrence of a term starts with a capital
    letter, the term is saved in all caps; otherwise it is saved in its
    lower-case form. It also merges entities into the inverted index when
    they appear in the corpus more than once, and saves the merged index to
    disk for future use.
    :param config: config class that contains info about where to retrieve the saved files
    :param files_num: how many temporary files to merge in each category
    :return: number of total terms in the index
    """
    merged_index = {}

    # Just merge all the terms in the temporary indexes into one index
    file_prefix = config.get_save_files_dir() + "/tmp/inverted_idx_"
    for i in range(files_num):
        current_index = utils.load_obj(file_prefix + str(i))
        for term, appearances in current_index.items():
            if term not in merged_index.keys():
                merged_index[term] = appearances
            else:
                merged_index[term] += appearances

    # Handle the capital restriction
    merged_index_after_cap = {}
    for term, value in merged_index.items():
        if term[0].islower():
            if term not in merged_index_after_cap.keys():
                merged_index_after_cap[term] = value
            else:
                merged_index_after_cap[term] += value
        else:  # case it contains uppercase
            if term.lower() in merged_index.keys():
                # the same term appears in lower case somewhere in the corpus
                if term.lower() not in merged_index_after_cap.keys():
                    merged_index_after_cap[term.lower()] = value
                else:
                    merged_index_after_cap[term.lower()] += value
            else:  # the term is actually capital-only
                merged_index_after_cap[term.upper()] = value

    # An entity is added to the index only if it appears more than once in the corpus
    entities_idxs_prefix = config.get_save_files_dir() + "/tmp/entities_idx_"
    for i in range(files_num):
        current_entities = utils.load_obj(entities_idxs_prefix + str(i))
        for term, appearances in current_entities.items():
            if appearances > 1:
                merged_index_after_cap[term] = appearances

    total_terms = len(merged_index)
    # print("Total num of terms: {}".format(total_terms))

    # Save the merged index to disk
    saving_dir = config.get_save_files_dir()
    utils.save_obj(merged_index_after_cap, saving_dir + "/inverted_index")
    return total_terms
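# A small worked example of the capital-letters rule applied by merge_index
# (illustrative values only, not taken from a real corpus):
#
#   input:  {'apple': 3, 'Apple': 2, 'Nasa': 4}
#   'Apple' also occurs as 'apple' somewhere in the corpus, so its count
#   folds into the lower-case entry: 'apple' -> 5.
#   'Nasa' never occurs in lower case, so it is stored in all caps: 'NASA' -> 4.
#   output: {'apple': 5, 'NASA': 4}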
def clearSingleEntities(inv_index, parser, output_path, num_of_docs_in_corpus):
    """
    :param inv_index: inverted index
    :param parser: parser
    :param output_path: output path
    :param num_of_docs_in_corpus: number of docs in the corpus
    :return:
    """
    EntitiesDict = {}  # {doc_id: [term1, term2]}
    docs_to_clear = {}  # {pkl_id: [doc1, doc2]}
    vectorsDict = {}  # {doc_id: normalized vectors}

    # For each term in inv_index, check whether it should be cleared.
    # A term is cleared if it is a single entity or appears once in the whole corpus.
    for term in inv_index.keys():
        if inv_index[term][0] == 1:
            single_doc = inv_index[term][1][0]
            if single_doc in EntitiesDict.keys():
                EntitiesDict[single_doc].append(term)
            else:
                EntitiesDict[single_doc] = [term]

    # if there are no entities to remove, return
    if len(EntitiesDict.keys()) == 0:
        return

    sorted_keys = sorted(EntitiesDict.keys())  # all docs to clear
    key_num = int(sorted_keys[0] / indexer.postingSize)
    docs_to_clear[key_num] = []
    for doc_id in sorted_keys:
        if doc_id >= (key_num + 1) * indexer.postingSize:
            # should get new data, update key_num
            key_num = int(doc_id / indexer.postingSize)
            docs_to_clear[key_num] = [doc_id]
        else:
            docs_to_clear[key_num] += [doc_id]

    for pkl_key in docs_to_clear.keys():
        data = utils.load_obj(output_path + '/PostingFiles/' + str(pkl_key))
        for doc_id in data.keys():  # key is now a string
            doc_idint = int(doc_id)
            if doc_idint in EntitiesDict.keys():
                for entity in EntitiesDict[doc_idint]:
                    if len(data[doc_id][3]) >= 5 or parser.isEntity(entity):
                        data[doc_id][1] -= data[doc_id][3][entity]
                        data[doc_id][3].pop(entity)
                        inv_index.pop(entity)
                        values = data[doc_id][3].values()
                        if len(values) != 0:
                            data[doc_id][2] = max(values)
                        else:
                            data[doc_id][2] = 0
                updateVectorsFile(doc_id, data[doc_id], inv_index,
                                  num_of_docs_in_corpus, vectorsDict)
        utils.save_obj(data, output_path + '/PostingFiles/' + str(pkl_key))

    utils.save_obj(vectorsDict, output_path + '/PostingFiles/vectorsFile')
    vectorsDict.clear()
    docs_to_clear.clear()
    EntitiesDict.clear()
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    indexer = (self.inverted_idx, self.postingDict, self.documents_dict)
    utils.save_obj(indexer, fn)
def save_meshes(reorganize_idx, outputs, output_dir, smpl_faces):
    vids_org = np.unique(reorganize_idx)
    for idx, vid in enumerate(vids_org):
        verts_vids = np.where(reorganize_idx == vid)[0]
        img_path = outputs['meta_data']['imgpath'][verts_vids[0]]
        obj_name = os.path.join(output_dir, '{}'.format(os.path.basename(img_path)))
        obj_name = obj_name.replace('.mp4', '').replace('.jpg', '').replace('.png', '') + '.obj'
        for subject_idx, batch_idx in enumerate(verts_vids):
            save_obj(outputs['verts'][batch_idx].detach().cpu().numpy().astype(np.float16),
                     smpl_faces,
                     obj_name.replace('.obj', '_{}.obj'.format(subject_idx)))
def compute_embeddings(args):
    # Load pre-trained model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    model = BertModel.from_pretrained(args.bert_model)
    model.to(args.device)
    model.eval()

    # Prepare data
    allreasons = read_data(args)
    features = convert_to_features(allreasons, tokenizer, args.max_seq_length)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_ids = torch.tensor([f.instance_id for f in features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Run prediction for full data
    logger.info("***** Extracting pre-trained BERT embeddings *****")
    logger.info("Num samples = %d", len(allreasons))
    for batch_idx, inputs in enumerate(eval_dataloader):
        # Send data to GPU
        input_ids = inputs[0].to(args.device)
        input_mask = inputs[1].to(args.device)
        segment_ids = inputs[2].to(args.device)
        # Apply model
        with torch.no_grad():
            _, cls_output = model(input_ids, segment_ids, input_mask,
                                  output_all_encoded_layers=False)
        # Accumulate embeddings batch by batch
        if batch_idx == 0:
            embeddings = cls_output.data.cpu().numpy()
        else:
            embeddings = np.concatenate(
                (embeddings, cls_output.data.cpu().numpy()), axis=0)

    # Save in a file
    logger.info('Embeddings shape %s' % str(embeddings.shape))
    fileout = os.path.join(args.data_dir, args.embsfile)
    if not os.path.exists(os.path.dirname(fileout)):
        os.mkdir(os.path.dirname(fileout))
    utils.save_obj(embeddings, fileout)
    logger.info('Embeddings saved into %s' % fileout)
def write_batch_postings(self):
    """
    Writes all partial posting files in the current batch to disk in .pkl form
    """
    for posting_batch_pointer, posting_batch in self.postingDict.items():
        utils.save_obj(posting_batch,
                       self.output_path + "{}".format(posting_batch_pointer))
def set(self, filename, data):
    with self._disk_lock:
        if type(filename) == types.UnicodeType:
            filename = filename.encode('utf-8')
        self.disk_write_count += 1
        if filename in self.disk_cache:
            del self.disk_cache[filename]
        if len(self.disk_cache) == self.disk_cache_size:
            # evict the oldest entry and remove its file from disk
            old_filename = self.disk_cache.popitem(last=False)[0]
            os.unlink(self.disk_cache_dir + old_filename)
        self.disk_cache[filename] = True
        utils.save_obj(self.disk_cache_dir + filename, data)
        super(LifoCache, self).set(filename, data)
def draw_centers(self):
    # make the appropriate directory
    dir_name = os.path.join(
        _output_dir,
        str(datetime.datetime.now()).replace(' ', '_') + "_".join(sys.argv[1:]))
    try:
        os.mkdir(dir_name)
    except OSError:
        pass  # directory may already exist

    # write the arguments so we know what we did
    f = open(os.path.join(dir_name, "args.txt"), 'w')
    f.write(repr(sys.argv))
    f.close()

    # save clusters to file
    utils.save_obj(self.confirm, os.path.join(dir_name, "confirm.pkl"))

    # draw each cluster center
    for cluster in self.clusters:
        im = cluster.center.draw()
        im.save(os.path.join(dir_name, "cluster_%d.png" % cluster._id))
def save_blacklist(self, blacklist):
    result = self.load_blacklist()
    for s in blacklist:
        result.add(s.split(u':')[0].strip())
    filename = self.blacklist_filename()
    utils.save_obj(filename, result)
def save_dicts(self, variant, cache):
    filename = self.cache_filename(variant)
    utils.save_obj(filename, cache)
def save(self, filename):
    # no need for a lock here
    items = self.copy_items()
    utils.save_obj(filename, items)
def close(self):
    self.fd_data.close()
    utils.save_obj(self.filename + '.index', self.index)
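# close() above pairs the raw data file with a pickled '<filename>.index'
# side file. A sketch of the matching open path, assuming the index was
# written by the close() shown here and that the attribute names mirror the
# original (both assumptions; the real open path is not shown):
def open(self, filename):
    self.filename = filename
    self.fd_data = open(filename, 'rb')
    self.index = utils.load_obj(filename + '.index')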