Code example #1
File: data.py Project: samarthbhargav/hackathon4good
    def splitDatapoints(self):
        cached_mappings = load_obj(self.MAP_FILE)

        datapoints = cached_mappings['file']
        indexes = cached_mappings['index']

        allIndexes = list(range(len(datapoints)))

        np.random.shuffle(allIndexes)

        training_offset = int(len(allIndexes) * 0.8)

        validation_offset = int(len(allIndexes) * 0.9)

        training_indexes = allIndexes[:training_offset]

        validation_indexes = allIndexes[training_offset:validation_offset]

        testing_indexes = allIndexes[validation_offset:]

        save_obj(
            {
                'train': self.getValues(datapoints, indexes, training_indexes),
                'val': self.getValues(datapoints, indexes, validation_indexes),
                'test': self.getValues(datapoints, indexes, testing_indexes)
            }, self.SPLIT_FILE)
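All of the snippets on this page call save_obj / load_obj helpers (either module-level or under a utils module) whose implementations are not shown. As a point of reference, here is a minimal sketch assuming the most common pattern for helpers with these names: thin wrappers around Python's pickle module using a '.pkl' extension (both the extension and the protocol are assumptions). Note that the argument order differs between projects (some pass the object first, others the file name first, example #24 uses a four-argument variant, and examples #8 and #28 use save_obj to write a Wavefront .obj mesh instead); the sketch follows the (obj, name) order used in examples #1 and #2.

import pickle


def save_obj(obj, name):
    # Assumed implementation: serialize obj to <name>.pkl with pickle.
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    # Assumed implementation: load and return the object saved under <name>.pkl.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)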
Code example #2
def compute_relevance_matrices(documents, annotations, thesaurus, vocab):

    for th in thesaurus:
        with open('last_th.txt', 'w') as f:
            f.write(th)

        # helps separate relevant docs from non-relevant ones
        corpus = defaultdict(lambda: [])

        # mark relevance for all documents
        for doc, tags, i in zip(documents, annotations, range(len(documents))):
            if th in tags:
                corpus['relevant'].append(doc)
            else:
                corpus['nonrelevant'].append(doc)

        # Word occurrences in relevant and non relevant documents
        rel_count_vec = word_occurence(corpus['relevant'], vocab)
        non_count_vec = word_occurence(corpus['nonrelevant'], vocab)

        # Number of relevant and non-relevant documents
        N_rel = len(corpus['relevant'])
        N_non = len(corpus['nonrelevant'])

        # Compute maximum likelihood
        p_prob = compute_mle_vector(rel_count_vec, N_rel)
        q_prob = compute_mle_vector(non_count_vec, N_non)

        # save probabilities on disk
        save_obj(obj=(p_prob, q_prob), name=th)
Code example #3
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in the file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Code example #4
File: Project5.py Project: shacocn/EE219
def Q1_1(category):
    with open(fileLocation(category), encoding="utf8") as f:
        tweets = f.readlines()
        firstTs = FIRST_TS[category]
        firstTs = firstTs // 3600 * 3600
        lastTs = LAST_TS[category]
        totalHours = tsDiffHour(firstTs, lastTs) + 1

        hourCount = [0] * totalHours
        followerCount = 0
        retweetCount = 0

        for tweet in tweets:
            t = json.loads(tweet)
            ts = t['citation_date']
            # count hour
            hourDiff = tsDiffHour(firstTs, ts)
            hourCount[hourDiff] += 1
            # count follower
            followerCount += t['author']['followers']
            # count retweets
            retweetCount += t['metrics']['citations']['total']

        save_obj(category + '_numTweetsInHour', hourCount)
        # report average number of tweets per hour
        print(category + ': ' + 'Average number of tweets per hour: ' +
              str(np.mean(hourCount)))
        print(category + ': ' +
              'Average number of followers of users posting the tweets: ' +
              str(followerCount / len(tweets)))
        print(category + ': ' + 'Average number of retweets: ' +
              str(retweetCount / len(tweets)))
Code example #5
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        to_del = []

        def remove_word_1():
            for key in self._indexer.inverted_idx:
                if (self._indexer.inverted_idx[key] == 1
                        and key.isalpha() == False):
                    to_del.append(key)
                    self._indexer.postingDict.pop(key)
            for key in to_del:
                self._indexer.inverted_idx.pop(key)

        to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
                   self._indexer.num_of_docs, self._indexer.avg_Size_doc)
        utils.save_obj(to_Save, "index_3")
Code example #6
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self._indexer.add_square_Wij()
        to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
                   self._indexer.num_of_docs, self._indexer.avg_Size_doc,
                   self._indexer.doc_info)
        utils.save_obj(to_Save, "index_4")

        print('Finished parsing and indexing.')
Code example #7
    def remove_uppercase_and_entities(self, indexer):
        word_in_lower_and_upper = []
        inverted_idx = indexer.inverted_idx

        # check whether a word found in upper case is also found in lower case; if so, merge its postings and remove the upper-case form from the posting file (and inverted index)
        for letter in self.uppercase_dict:
            upper_to_lower_words = [
                x.lower() for x in list(self.uppercase_dict[letter])
            ]
            for word in upper_to_lower_words:
                if word in inverted_idx:
                    word_in_lower_and_upper.append(word)

            letter_posting_file = utils.load_obj(indexer.out + letter.lower())
            for word in word_in_lower_and_upper:
                if word in letter_posting_file and word.upper(
                ) in letter_posting_file:  # TODO why do we need to check this - debug
                    word_appearance = letter_posting_file[word.upper()]
                    letter_posting_file[word].extend(word_appearance)
                    del letter_posting_file[word.upper()]
                    del inverted_idx[word.upper()]

            # entities - check if they appear at least twice. if not - remove from posting files (and inverted index)
            for entity in self.entities_dict[letter]:
                if entity in letter_posting_file and len(
                        letter_posting_file[entity]) < 2:
                    del letter_posting_file[entity]
                    del inverted_idx[entity]
            utils.save_obj(letter_posting_file, indexer.out + letter)
Code example #8
File: mesh.py Project: ownzonefeng/ntds_groupwork
    def save(self, filename):
        """
        Dump current mesh into an *.obj file

        :param filename:    Filename
        """
        utils.save_obj(self.vertex, self.tri, filename)
Code example #9
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        doc_len = len(documents_list)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document, doc_len)
        # print('Finished parsing and indexing.')

        # print('Finished merge, start rebuild posting dict')
        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()
        # print('finished rebuild inverted index')

        to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
                   self._indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')
        # TODO: inverted_idx, tweet_dict,reversed_inverted_index, to_save change to None
        self._indexer.inverted_idx = None
        self._indexer.tweet_dict = None
        self._indexer.reversed_inverted_index = None
        to_save = None
Code example #10
def run_engine(corpus_path='', output_path='', stemming=False):
    """

    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)
    # Get all parquet files from corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents in parallel using a worker pool
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()

    p.entities.clear()
    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
Code example #11
def train(df, attrs, clf_class, clf_name, model_params, mode, magic_number,
          dates, dataset_name, trading_params):
    trade_freq = trading_params['trade_frequency']
    name = '%s-%s-attr%s-%s-%s-%s-%s-%s_' % (
        clf_name, dataset_name, len(attrs), dict_to_str(model_params).replace(
            ' ', '_').replace(':', ''), mode, magic_number,
        pd.to_datetime(dates[0], format=DATE_FORMAT).date(),
        pd.to_datetime(dates[1], format=DATE_FORMAT).date())
    cached_file = os.path.join(CACHE_PATH + '/models/', name)

    start_date, final_date = dates
    idx = 0

    indices = sorted([
        day for day in list(set(df.index.values))
        if start_date <= day <= final_date
    ])

    print("Model and params: %s %s " % (clf_name, model_params))
    # magic_number is 53 by default: 52 weeks for training, 1 for prediction
    while idx + magic_number < len(indices) and indices[idx + magic_number] <= \
            indices[-1]:

        if mode == CLASSIFICATION:
            train_x, train_y, test_x, test_y = \
                get_classification_data(clf_name, df, attrs, indices, idx,
                                        magic_number)
        elif mode == REGRESSION:
            # get regression datasets (target is float y -> ratio of increase)
            train_x, train_y, test_x, test_y = \
                get_regression_data(clf_name, df, attrs, indices, idx,
                                    magic_number)

        print(
            "Training %s/%s with %s instances." %
            (idx // trade_freq, len(indices) // trade_freq, train_x.shape[0]))
        sys.stdout.flush()

        clf_cached_file = cached_file + str(indices[idx])[:10]

        if not CHECKPOINTING:
            clf = clf_class(**model_params).fit(train_x, train_y)
        else:
            try:
                clf = load_obj(clf_cached_file)
            except:
                clf = clf_class(**model_params).fit(train_x, train_y)
                save_obj(clf, clf_cached_file)

        pred = clf.predict(test_x)

        # import ipdb
        # ipdb.set_trace()
        df.loc[indices[idx + magic_number], clf_name] = pred

        idx += trade_freq
    df_trade = df.dropna(axis=0)

    print("Finished training for %s" % (clf_name))
    return df_trade
Code example #12
    def create_database(self):
        all_infoss = self.read_info_files()
        
        if self.shuffle:
            from random import shuffle
            shuffle(all_infoss)
        else:
            all_infoss.sort(key=lambda tup: int(tup[0]))
                
        age_infos, sex_infos = self.edit_all_infos(all_infoss)
        
        self.make_folders()

#       Copy Age Class Images        
        train_age,  test_age = self.split_infos(age_infos,age=True)
        age_train_occured_labels = self.copy_all2(train_age,self.db_age_train_folder_path)
        self.copy_all2(test_age,self.db_age_test_folder_path)
        
        age_classs_weight = self.calculate_class_weights(self.age_labels, age_train_occured_labels)
        
        utils.save_obj(age_classs_weight,self.db_age_folder_path)
        print("AGE CLASS WEIGHT")
        print(age_classs_weight)

#       Copy Sex Class Images        
        train_sex, test_sex = self.split_infos(sex_infos,age=False)
        sex_train_occured_labels = self.copy_all2(train_sex,self.db_sex_train_folder_path)
        self.copy_all2(test_sex,self.db_sex_test_folder_path)
        
        sex_classs_weight = self.calculate_class_weights(self.sex_labels, sex_train_occured_labels)
        
        utils.save_obj(sex_classs_weight,self.db_sex_folder_path)
        print("SEX CLASS WEIGHT")
        print(sex_classs_weight)
Code example #13
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        time1 = time.time()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        to_del = []

        # saving the objects that are needed while searching
        to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
                   self._indexer.num_of_docs, self._indexer.avg_Size_doc)
        utils.save_obj(to_Save, "index_best")

        def remove_word_1():
            for key in self._indexer.inverted_idx:
                if (self._indexer.inverted_idx[key] == 1):
                    to_del.append(key)
                    self._indexer.postingDict.pop(key)
            for key in to_del:
                self._indexer.inverted_idx.pop(
                    key)  # DO NOT MODIFY THIS SIGNATURE
Code example #14
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        # r = ReadFile(ConfigClass.corpusPath)
        # documents_list = r.readAllCorpus()  # change if we need to read more than 1 parquet

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()

        # needed to pass boris tests; sometimes inverted_idx fails to save in the testing system
        utils.save_obj({}, "inverted_idx")

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            if parsed_document.doc_length != 0:  # sometimes we get an empty tweet; no need to index it
                # index the document data
                self._indexer.add_new_doc(parsed_document)
        # Inserting entities to the indexer and posting files
        self._indexer.addEntities(self._parser.suspectedEntityDict)
        # Sort the posting files
        self._indexer.update_idfWij(idx)
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')
Code example #15
 def initialize_buckets(self, num_of_buckets, first_bucket_index=0):
     """
     Creates the bucket files on the disk and saves a mapping for their indices
     """
     for i in range(num_of_buckets):
         utils.save_obj([], "bucket" + str(first_bucket_index + i))
         self.buckets_mapping[i] = first_bucket_index + i
Code example #16
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     utils.save_obj(self.inverted_idx, fn)
Code example #17
File: oldIndexer.py Project: GalAgas/SEPartC
 def save_doc(self):
     if len(self.docs_posting) > 0:
         # self.docs_inverted[self.docs_counter] = self.docs_list_for_inverted
         utils.save_obj(self.docs_posting, self.config.get_savedFileMainFolder() + '\\doc' + str(self.docs_counter))
         self.num_of_docs_in_posting = 0
         # self.docs_list_for_inverted = []
         self.docs_counter += 1
         self.docs_posting = {}
Code example #18
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     utils.save_obj((self.inverted_idx, self.postingDict, self.docs_dict,
                     self.pop_dict), fn)
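The matching load side for these save_index methods is not shown in the snippets. A minimal sketch of a load_index counterpart for example #18, assuming utils.load_obj is the pickle-based inverse of utils.save_obj (the attribute names come from the snippet above; the method name load_index is an assumption):

 def load_index(self, fn):
     """
     Loads a pre-computed index previously written by save_index (assumed counterpart).
     Input:
           fn - file name of pickled index.
     """
     (self.inverted_idx, self.postingDict, self.docs_dict,
      self.pop_dict) = utils.load_obj(fn)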
Code example #19
File: indexer.py Project: RonitTsysar/Search_Engine
 def save_in_merge(self, merged_posting, merged_list):
     utils.save_obj(
         merged_posting,
         self.config.get_savedFileMainFolder() + "\\" +
         str(self.posting_files_counter))
     merged_list.append(self.posting_files_counter)
     self.posting_files_counter += 1
     return {}
Code example #20
File: indexer.py Project: noaakl/search_engine_1
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     avg_doc_len = Indexer.avg_doc_len
     utils.save_obj([self.inverted_idx, self.postingDict, self.doc_file, avg_doc_len], fn)
Code example #21
File: indexer.py Project: GalAgas/SEPartC
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     index_tup = (self.inverted_idx_doc, self.inverted_idx_term)
     utils.save_obj(index_tup, fn)
Code example #22
def create_search_dict(vocabulary_dict, embedding_dict=None):
    if not embedding_dict:
        embedding_dict = utils.load_obj("embedding_dict")
    new_embedding_dict = {}
    for word in embedding_dict.keys():
        if word in vocabulary_dict.keys():
            new_embedding_dict[word] = embedding_dict[word]
    utils.save_obj(new_embedding_dict, "new_embedding_dict")
Code example #23
 def clean_memory(self):
     # Save posting file as pickle and clear all buckets in RAM
     for bucket_id, bucket_dict in self.buckets.items():
         save_obj(
             bucket_dict, self.POSTING_PATH + '/' + bucket_id + "_" +
             str(self.current_dump))
         self.buckets[bucket_id] = {}
     self.current_dump += 1
Code example #24
    def measure(self, generated, vessels, masks, num_data, iter_time, phase,
                total_time):
        # masking
        vessels_in_mask, generated_in_mask = utils.pixel_values_in_mask(
            vessels, generated, masks)

        # average processing time
        avg_pt = (total_time / num_data) * 1000

        # evaluate Area Under the Curve of ROC and Precision-Recall
        auc_roc = utils.AUC_ROC(vessels_in_mask, generated_in_mask)
        auc_pr = utils.AUC_PR(vessels_in_mask, generated_in_mask)

        # binarize to calculate Dice Coefficient
        binarys_in_mask = utils.threshold_by_otsu(generated, masks)
        dice_coeff = utils.dice_coefficient_in_train(vessels_in_mask,
                                                     binarys_in_mask)
        acc, sensitivity, specificity = utils.misc_measures(
            vessels_in_mask, binarys_in_mask)
        score = auc_pr + auc_roc + dice_coeff + acc + sensitivity + specificity

        # # auc_sum for saving best model in training
        # auc_sum = auc_roc + auc_pr
        # if self.flags.stage == 2:
        #     #auc_sum = auc_roc + auc_pr
        #     auc_sum = auc_roc + auc_pr
        # else:
        #     auc_sum = auc_roc + auc_pr

        auc_sum = dice_coeff + acc + auc_pr

        # print information
        ord_output = collections.OrderedDict([('auc_pr', auc_pr),
                                              ('auc_roc', auc_roc),
                                              ('dice_coeff', dice_coeff),
                                              ('acc', acc),
                                              ('sensitivity', sensitivity),
                                              ('specificity', specificity),
                                              ('score', score),
                                              ('auc_sum', auc_sum),
                                              ('best_auc_sum',
                                               self.best_auc_sum),
                                              ('avg_pt', avg_pt)])
        utils.print_metrics(iter_time, ord_output)

        # write in tensorboard when in train mode only
        if phase == 'train':
            self.model.measure_assign(auc_pr, auc_roc, dice_coeff, acc,
                                      sensitivity, specificity, score,
                                      iter_time)
        elif phase == 'test':
            # write in npy format for evaluation
            utils.save_obj(vessels_in_mask, generated_in_mask,
                           os.path.join(self.auc_out_dir, "auc_roc.npy"),
                           os.path.join(self.auc_out_dir, "auc_pr.npy"))

        return auc_sum
Code example #25
def merge_index(config, files_num):
    """
    The function loads all the temporary index files that was made by the parse_and_index function and merge them into
    a united index.
    The function deals with the capital letters rule, where all the occurences of a term are starting with capital
    letters, it will be save in all capital. Otherwise it will be saved in the lower version.
    The function also merge the entites into the inverted index in case they appear in the corpus more than once.
    The function save the merged index to the disk for future use.
    :param config: config class that contains info about where to retrieve the saved files
    :param files_num: How many temporary files to merge in each category
    :return: Number of total terms in the index
    """
    merged_index = {}

    # Just merge all the terms in the index into one index
    file_prefix = config.get_save_files_dir() + "/tmp/inverted_idx_"
    for i in range(files_num):
        current_index = utils.load_obj(file_prefix + str(i))
        for term, apperances in current_index.items():
            if term not in merged_index.keys():
                merged_index[term] = apperances
            else:
                merged_index[term] += apperances

    # Handle the capital restriction
    merged_index_after_cap = {}
    for term, value in merged_index.items():
        if term[0].islower():
            if term not in merged_index_after_cap.keys():
                merged_index_after_cap[term] = value
            else:
                merged_index_after_cap[term] += value
        else:  # case it contains uppercase
            if term.lower() in merged_index.keys(
            ):  # case there is the same term in lower somewhere in the corpus
                if term.lower() not in merged_index_after_cap.keys():
                    merged_index_after_cap[term.lower()] = value
                else:
                    merged_index_after_cap[term.lower()] += value
            else:  # case it is actually capital only
                merged_index_after_cap[term.upper()] = value

    # Check if an entity appears more than once in the corpus it's being added to the index
    entities_idxs_prefix = config.get_save_files_dir() + "/tmp/entities_idx_"
    for i in range(files_num):
        current_entities = utils.load_obj(entities_idxs_prefix + str(i))
        for term, apperances in current_entities.items():
            if apperances > 1:
                merged_index_after_cap[term] = apperances

    total_terms = len(merged_index)
    #print("Total num of terms: {}".format(total_terms))
    # Save the merged index to disk
    saving_dir = config.get_save_files_dir()
    utils.save_obj(merged_index_after_cap, saving_dir + "/inverted_index")

    return total_terms
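Since the capital-letters rule described in the docstring is easy to misread, here is a tiny self-contained illustration of the same loop with made-up counts (the terms and the numbers are purely hypothetical):

# Toy illustration of the capital-letters rule used in merge_index above.
merged_index = {'Apple': 3, 'apple': 2, 'Nasa': 4}

merged_index_after_cap = {}
for term, count in merged_index.items():
    if term[0].islower():
        merged_index_after_cap[term] = merged_index_after_cap.get(term, 0) + count
    elif term.lower() in merged_index:
        # the term also appears in lower case somewhere -> fold it into the lower-case entry
        lower = term.lower()
        merged_index_after_cap[lower] = merged_index_after_cap.get(lower, 0) + count
    else:
        # the term only ever appears capitalized -> store it in all capitals
        merged_index_after_cap[term.upper()] = count

print(merged_index_after_cap)  # {'apple': 5, 'NASA': 4}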
Code example #26
def clearSingleEntities(inv_index, parser, output_path, num_of_docs_in_corpus):
    """
    :param inv_index: inv_index
    :param parser: parser
    :param output_path: output_path
    :param num_of_docs_in_corpus: #docs in corpus
    :return:
    """
    EntitiesDict = {}  #{doc_id: [term1,term2]}
    docs_to_clear = {}  # {pkl_id: [doc1 ,doc2]}
    vectorsDict = {}  # {doc_id: normalized vectors}
    # for each term in inv_index, check whether it should be cleared up.
    # a term is cleared if it is an entity or term that appears in only a single document in the whole corpus.
    for term in inv_index.keys():
        if inv_index[term][0] == 1:
            single_doc = inv_index[term][1][0]
            if single_doc in EntitiesDict.keys():
                EntitiesDict[single_doc].append(term)
            else:
                EntitiesDict[single_doc] = [term]
    # if there's no entities to remove, return.
    if len(EntitiesDict.keys()) == 0:
        return
    sorted_keys = sorted(EntitiesDict.keys())  # all docs to clear
    key_num = int(sorted_keys[0] / indexer.postingSize)
    docs_to_clear[key_num] = []
    for doc_id in sorted_keys:
        if doc_id >= (
                key_num + 1
        ) * indexer.postingSize:  # should get new data, update key_num
            key_num = int(doc_id / indexer.postingSize)
            docs_to_clear[key_num] = [doc_id]
        else:
            docs_to_clear[key_num] += [doc_id]

    for pkl_key in docs_to_clear.keys():
        data = utils.load_obj(output_path + '/PostingFiles/' + str(pkl_key))
        for doc_id in data.keys():  #key is now a string
            doc_idint = int(doc_id)
            if doc_idint in EntitiesDict.keys():
                for entity in EntitiesDict[doc_idint]:
                    if len(data[doc_id][3]) >= 5 or parser.isEntity(entity):
                        data[doc_id][1] -= data[doc_id][3][entity]
                        data[doc_id][3].pop(entity)
                        inv_index.pop(entity)
            values = data[doc_id][3].values()
            if len(values) != 0:
                data[doc_id][2] = max(values)
            else:
                data[doc_id][2] = 0
            updateVectorsFile(doc_id, data[doc_id], inv_index,
                              num_of_docs_in_corpus, vectorsDict)
        utils.save_obj(data, output_path + '/PostingFiles/' + str(pkl_key))
    utils.save_obj(vectorsDict, output_path + '/PostingFiles/vectorsFile')
    vectorsDict.clear()
    docs_to_clear.clear()
    EntitiesDict.clear()
Code example #27
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     # raise NotImplementedError
     indexer = (self.inverted_idx, self.postingDict, self.documents_dict)
     utils.save_obj(indexer, fn)
Code example #28
File: demo_utils.py Project: majunfu/ROMP
def save_meshes(reorganize_idx, outputs, output_dir, smpl_faces):
    vids_org = np.unique(reorganize_idx)
    for idx, vid in enumerate(vids_org):
        verts_vids = np.where(reorganize_idx==vid)[0]
        img_path = outputs['meta_data']['imgpath'][verts_vids[0]]
        obj_name = os.path.join(output_dir, '{}'.format(os.path.basename(img_path))).replace('.mp4','').replace('.jpg','').replace('.png','')+'.obj'
        for subject_idx, batch_idx in enumerate(verts_vids):
            save_obj(outputs['verts'][batch_idx].detach().cpu().numpy().astype(np.float16), \
                smpl_faces,obj_name.replace('.obj', '_{}.obj'.format(subject_idx)))
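In examples #8 and #28, save_obj is not a pickle helper: it exports a triangle mesh (vertices and faces) to a Wavefront .obj file. Neither project's implementation is shown here; the following is a minimal sketch of such a writer, assuming float vertex coordinates and 0-based integer face indices (the function name and signature are illustrative, not taken from either project):

import numpy as np


def save_obj(vertices, faces, filename):
    # Assumed implementation: write a triangle mesh to a Wavefront .obj file.
    vertices = np.asarray(vertices, dtype=float)
    faces = np.asarray(faces, dtype=int)
    with open(filename, 'w') as f:
        for v in vertices:
            # one 'v' line per vertex: x y z
            f.write('v {:.6f} {:.6f} {:.6f}\n'.format(v[0], v[1], v[2]))
        for tri in faces:
            # .obj face indices are 1-based
            f.write('f {} {} {}\n'.format(tri[0] + 1, tri[1] + 1, tri[2] + 1))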
Code example #29
def compute_embeddings(args):

    # Load pre-trained model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    model = BertModel.from_pretrained(args.bert_model)
    model.to(args.device)
    model.eval()

    # Prepare data
    allreasons = read_data(args)
    features = convert_to_features(allreasons, tokenizer, args.max_seq_length)
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_ids = torch.tensor([f.instance_id for f in features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Run prediction for full data
    logger.info("***** Extracting pre-trained BERT embeddings *****")
    logger.info("Num samples = %d", len(allreasons))
    for batch_idx, inputs in enumerate(eval_dataloader):

        # Send data to GPU
        input_ids = inputs[0].to(args.device)
        input_mask = inputs[1].to(args.device)
        segment_ids = inputs[2].to(args.device)

        #  Apply model
        with torch.no_grad():
            _, cls_output = model(input_ids,
                                  segment_ids,
                                  input_mask,
                                  output_all_encoded_layers=False)

        # Save embeddings
        if batch_idx == 0:
            embeddings = cls_output.data.cpu().numpy()
        else:
            embeddings = np.concatenate(
                (embeddings, cls_output.data.cpu().numpy()), axis=0)

    # Save in a file
    logger.info('Embeddings shape %s' % str(embeddings.shape))
    fileout = os.path.join(args.data_dir, args.embsfile)
    if not os.path.exists(os.path.dirname(fileout)):
        os.mkdir(os.path.dirname(fileout))
    utils.save_obj(embeddings, fileout)
    logger.info('Embeddings saved into %s' % fileout)
Code example #30
File: indexer.py Project: yairch/Search_Engine
    def write_batch_postings(self):
        """
        Writes all partial posting files in current batch to disk in .pkl form
        """

        for posting_batch_pointer, posting_batch in self.postingDict.items():
            utils.save_obj(
                posting_batch,
                self.output_path + "{}".format(posting_batch_pointer))
Code example #31
File: lifo_cache.py Project: Aubreymcfato/phetools
    def set(self, filename, data):
        with self._disk_lock:
            if type(filename) == types.UnicodeType:
                filename = filename.encode('utf-8')
            self.disk_write_count += 1
            if filename in self.disk_cache:
                del self.disk_cache[filename]
            if len(self.disk_cache) == self.disk_cache_size:
                old_filename = self.disk_cache.popitem(last = False)[0]
                os.unlink(self.disk_cache_dir + old_filename)

            self.disk_cache[filename] = True
            utils.save_obj(self.disk_cache_dir + filename, data)
            super(LifoCache, self).set(filename, data)
Code example #32
File: metric.py Project: waldol1/formCluster
	def draw_centers(self):
		# make the appropriate directory
		dir_name = os.path.join(_output_dir, str(datetime.datetime.now()).replace(' ', '_') + "_".join(sys.argv[1:]))
		try:
			os.mkdir(dir_name)
		except:
			pass
		
		# write the arguments so we know what we did
		f = open(os.path.join(dir_name, "args.txt"), 'w')
		f.write(repr(sys.argv))
		f.close()

		# save clusters to file
		utils.save_obj(self.confirm, os.path.join(dir_name, "confirm.pkl"))		

		# draw each cluster center
		for cluster in self.clusters:
			im = cluster.center.draw()
			im.save(os.path.join(dir_name, "cluster_%d.png" % cluster._id))
Code example #33
File: modernization.py Project: WeftWiki/phetools
 def save_blacklist(self, blacklist):
     result = self.load_blacklist()
     for s in blacklist:
         result.add(s.split(u':')[0].strip())
     filename = self.blacklist_filename()
     utils.save_obj(filename, result)
Code example #34
File: modernization.py Project: WeftWiki/phetools
 def save_dicts(self, variant, cache):
     filename = self.cache_filename(variant)
     utils.save_obj(filename, cache)
Code example #35
File: job_queue.py Project: phil-el/phetools
 def save(self, filename):
     # no need of a lock here.
     items = self.copy_items()
     utils.save_obj(filename, items)
Code example #36
 def close(self):
     self.fd_data.close()
     utils.save_obj(self.filename + '.index', self.index)