def indexit(tokenizer, filenames):
    indexer = Indexer(tokenizer)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.sort()
    return indexer
def main():
    path = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data', 'td')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'td')
    gt_name = 'T1_erisk_golden_truth.txt'
    corpus_reader_test = CorpusReader(path, gt_name)
    corpus_reader_test.load()

    all_texts = [''.join(map(str, subject.posts)) for subject in corpus_reader_train.subjects]
    all_gt = [subject.gt for subject in corpus_reader_train.subjects]

    count_vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w+', ngram_range=(1, 2))
    bow = dict()
    bow["train"] = (count_vectorizer.fit_transform(all_texts), all_gt)

    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(*bow["train"])

    matrix = Matrix(len(corpus_reader_test.subjects), corpus_reader_test.subjects)
    args = {'matrix': matrix, 'vec': count_vectorizer, 'class': lr_classifier}
    matrix = run_simulation(args)
    print(matrix)

    # analyze results
    precision = measures.calc_precision(corpus_reader_test.subjects, matrix)
    recall = measures.calc_recall(corpus_reader_test.subjects, matrix)
    f1 = measures.calc_f1(precision, recall)
    ERDE = measures.calc_ERDE(corpus_reader_test.subjects, matrix)
def indexit(tokenizer, filenames, store_positions=False, calculate_tfidf=False, memory_usage=20):
    index = Index(tokenizer, store_positions)
    indexer = Indexer(index, 'index', max_memory_usage=memory_usage)
    for filename in filenames:
        indexer.index(CorpusReader(filename))
    indexer.merge(calculate_tfidf)
    return index
def indexit(tokenizer, filenames, store_positions=False, calculate_tfidf=False, memory_usage=20):
    indexer = Indexer(tokenizer, 'indexer', store_positions=store_positions, max_memory_usage=memory_usage)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
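# A minimal usage sketch for the indexit variants above. The Tokenizer class
# and the corpus filenames are assumptions for illustration only; they do not
# appear in the original snippets.
#
# tokenizer = Tokenizer()
# index = indexit(tokenizer, ['corpus/part1.csv', 'corpus/part2.csv'],
#                 store_positions=True, calculate_tfidf=True, memory_usage=50)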
def main():
    args = parse_args()
    r = CorpusReader(accent_map=args.accents, filter_punct=args.filter_punct, lower=args.lower)
    featdict, labels = r.get_featdict_from_lines(stdin, window=args.window)
    vec = DictVectorizer()
    X = vec.fit_transform(featdict).toarray()
    y, label_d = convert_labels(labels)
    cnt = defaultdict(int)
    # for l in y:
    #     cnt[label_d[l]] += 1
    # for k, v in cnt.iteritems():
    #     print('{0} {1}'.format(k.encode('utf8'), v))
    # print label_d
    # print(vec.fit_transform(featdict).toarray())
    # print vec.get_feature_names()
    run_pipeline(X, y)
def __init__(
    self,
    files=[],
    directories=[],
    skip=[],
    unigram_dictionary=None,
    noise_ratio=15,
    kernel=[1, 2, 3, 4, 5, 5, 4, 3, 2, 1],
    t=1.0e-5,
    batch_size=1000,
    parse=default_parse,
    verbose=True
):
    # Get a corpus reader
    self.corpus_reader = CorpusReader(
        files=files, directories=directories, skip=skip,
        parse=parse, verbose=verbose
    )

    # Load the unigram_dictionary
    if unigram_dictionary is not None:
        self.unigram_dictionary = unigram_dictionary
    else:
        self.unigram_dictionary = UnigramDictionary()

    self.noise_ratio = noise_ratio
    self.kernel = kernel
    self.t = t
    self.batch_size = batch_size

    # Validate the kernel. It should reflect the relative
    # frequencies of choosing tokens from a window of +/- K tokens
    # relative to a query token. So it must have an even number of
    # entries.
    if not len(self.kernel) % 2 == 0:
        raise ValueError(
            'kernel should reflect the relative frequencies of '
            'selecting a context token within +/- K of the query '
            'token, and so should have an equal number of entries '
            'defining frequencies to the left and right of the query '
            'token, and so should have an even number of entries.'
        )
def run(self):
    corpus_reader = CorpusReader(self.path)
    corpus_reader.load()
    analyser = SentimentIntensityAnalyzer()
    num_subs = len(corpus_reader.subjects)
    for i, sub in enumerate(corpus_reader.subjects):
        print(f"Number of subjects left: {num_subs - i}")
        for post in sub.posts:
            score = analyser.polarity_scores(str(post))
            s = score['compound']
            if abs(s) > self.threshold:
                string = spplit(str(post))
                # scan short word windows of the post and keep the
                # sentiment-heavy ones as important words
                for j in range(3):
                    for k in range(len(string) - j):
                        score_word = analyser.polarity_scores(' '.join(string[k:(k + j)]))
                        word_compound = score_word['compound']
                        if abs(word_compound) > self.threshold:
                            if string[k] not in self.imp_words:
                                self.imp_words.append(' '.join(string[k:(k + j)]))
#mod3 = pickle.load(open(MODEL3_NAME, 'rb'))
#mod4 = pickle.load(open(MODEL4_NAME, 'rb'))
#mod5 = pickle.load(open(MODEL5_NAME, 'rb'))
#mod6 = pickle.load(open(MODEL6_NAME, 'rb'))
#mod7 = pickle.load(open(MODEL7_NAME, 'rb'))
#device = torch.device("cuda")
#no_vader.to(device)

path = os.path.join('..', 'data', 'erisk-2021-t2')
#path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'test')
gt_name = 'golden_truth.txt'
corpus_reader_test = CorpusReader(path)
corpus_reader_test.load()

with open("file.txt", 'w') as f:
    for sub in corpus_reader_test.subjects:
        f.write("{} - {}\n".format(sub.id, sub.gt))

filename = "RESULTS_TEST_more_model3_no_token_param.txt"
# clean file
with open(filename, 'w') as file:
    pass

# find the greatest number of posts
posts_max = max([len(s.posts) for s in corpus_reader_test.subjects])
print(posts_max)
def __init__(
        self,
        max_len=50,          # Maximum sentence length, same for questions, answers and reviews
        num_reviews=20,      # Number of review candidates for each QA pair
        selftest=False,
        if_only_top_ans=True,
        top_score_recorder=None,
        load_meta=True,
        load_vocab=True,
        load_qa=True,
        load_review=True,
        load_word_embedding=True):
    try:
        # if not selftest:
        #     filename = os.path.join(DATA_PATH, 'datautil.pickle')
        # else:
        #     filename = os.path.join(DATA_PATH, 'datautil-selftest.pickle')
        # logger.info('Loading stored data from {} ...'.format(filename))
        # with open(filename, 'rb') as f:
        #     tmp_dict = pickle.load(f)
        # self.__dict__.clear()
        # self.__dict__.update(tmp_dict)
        self.selftest = selftest
        if load_meta:
            self._load_meta()
        if load_vocab:
            self._load_vocab()
        if load_qa:
            self._load_qa()
        if load_review:
            self._load_review()
        if load_word_embedding:
            self._load_word_embedding()
    except IOError:
        logger.info('Stored data not found, preprocessing ...')
        self.selftest = selftest
        self.max_len = max_len
        self.num_reviews = num_reviews

        logger.info('Initializing CorpusReader ...')
        corpusreader = CorpusReader(
            maxline=SELF_TEST_MAX_LINE if selftest else -1,
            num_reviews=(5 * self.num_reviews),
            if_only_top_ans=if_only_top_ans,
            load_glove=False if selftest else True)
        self.vocab_size = corpusreader.vocab_size
        self.num_pos_tags = corpusreader.num_pos_tags
        self.embed_matrix = corpusreader.embed_matrix
        self.w_embed_size = corpusreader.w_embed_size
        self.word2id = corpusreader.word2id
        self.id2word = corpusreader.id2word
        self.id2freq = corpusreader.id2freq
        self.pos2id = corpusreader.pos2id
        self.id2pos = corpusreader.id2pos

        logger.info('Read corpus data and convert to arrays ...')
        data, review_data, asin2id = self._read_into_arrays(
            corpusreader=corpusreader, if_only_top_ans=if_only_top_ans)
        self.review_data = review_data
        del corpusreader
        del review_data
        gc.collect()

        logger.info('Calculate review IDF ...')
        self.review_idf = self._get_review_idf()

        logger.info('Splitting data into train, dev, test sets ...')
        self._train_idx, self._dev_idx, self._test_idx = [], [], []
        self._train_size, self._dev_size, self._test_size = 0, 0, 0
        self._data_split(data)
        del data
        gc.collect()

        # logger.info('Storing into {}...'.format(filename))
        # with open(filename, 'wb') as f:
        #     pickle.dump(self.__dict__, f)
        self._save_meta()
        self._save_vocab()
        self._save_qa()
        self._save_review()
        self._save_word_embedding()

    self._block_to_dense()
    self.top_score_recorder = top_score_recorder
    if self.top_score_recorder is not None:
        logger.info("Train with Pseudo Relevance Feedbacks")
    self._print_info()
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
    print('reading...')
    reader = CorpusReader()
    reader.run()

    parser = PreProcess()
    parsed_trainning_documents = {}
    print('processing...')
    for k, v in reader.train.items():
        parsed_trainning_documents[k] = parser.process(v)

    # Input for tf-idf: we must annotate the documents with their classes.
    # It receives as input an array of tuples: ([tokens], class)
    parsed_trainning_documents_with_classes = []
    for k in parsed_trainning_documents.keys():
        parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

    # Run tf-idf
    print('generating tf.idf...')
    tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
    tf_idf_calculator.run()

    # Test the KNN parameters: distance metric and value of K
    for metric in ['cosine', 'euclid']:
__author__ = 'rwechsler'

import gensim
import sys
import glob
from corpus_reader import CorpusReader

files = glob.glob(sys.argv[1])
outfile_name = sys.argv[2]

dataset = CorpusReader(files)
model = gensim.models.Word2Vec(dataset, size=500, window=5, min_count=3, negative=5, workers=2)
model.save(outfile_name)
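# Hypothetical invocation of the script above (the script name and paths are
# assumptions, not part of the original): sys.argv[1] is a glob pattern
# expanded with glob.glob, sys.argv[2] is where the trained model is saved.
# Note that the size= keyword matches gensim < 4.0; gensim 4.x renamed it to
# vector_size.
#
#     python train_word2vec.py "corpus/*.txt" word2vec.model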
def train6():
    with open("log.txt", 'w') as f:
        pass

    #path1 = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data', 'train')
    #path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'td')
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data', 'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data', 'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token()

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")
    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")
    #n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    parameters = {
        'classifier__n_estimators': [50, 100, 500, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],
        'classifier__max_depth': [1, 3, 5, 10],
    }
    classifier = GradientBoostingClassifier()

    model = Pipeline([
        ('emojis', emo),
        #('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    clf = GridSearchCV(model, parameters)

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        print(all_gt[0])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    print("End of training")
    return clf
def get_input_option(prompt, options):
    res = input(prompt + " (" + "/".join(options) + ") ")
    while res not in options:
        res = input("pardon? (" + "/".join(options) + ") ")
    return res


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Corpus Filter')
    arg_parser.add_argument('corpus_file', help='path to the corpus file')
    arg_parser.add_argument('output_prefix', help='path to the output files')
    args = arg_parser.parse_args()

    print('\n - Filtering Corpus -\n')
    corpus = CorpusReader(args.corpus_file)

    file_output_pos = open(args.output_prefix + '.pos', 'w', encoding='utf8')
    file_output_neg = open(args.output_prefix + '.neg', 'w', encoding='utf8')
    file_output_fav = open(args.output_prefix + '.fav', 'w', encoding='utf8')

    for tweet in corpus.text_json():
        tweet = tweet.replace('\n', ' ')
        tweet = tweet.strip()
        print('"' + tweet + '"')
        action = get_input_option('sarcasm detected?', ['y', 'n', 'f', 'q'])
        if action == 'f':
            file_output_fav.write(tweet + '\n')
            action = get_input_option('faved, but is there sarcasm?', ['y', 'n', 'q'])
            if action == 'y':
                file_output_pos.write(tweet + '\n')
queries = []
if args.query:
    queries.append(args.query)
if args.file:
    with open(args.file, 'r') as fin:
        queries.extend([line.strip().split('\t')[1] for line in fin])

result = {}
queries = {
    query: collections.OrderedDict(
        itertools.islice(ranker.rank(query).items(), 10))
    for query in queries
}

for filename in filenames:
    for pmid, document in CorpusReader(filename).items():
        toremove = list()
        for query, scores in queries.items():
            score = scores.pop(pmid, None)
            if score is not None:
                if len(scores) == 0:
                    toremove.append(query)
                result_scores = result.setdefault(query, [])
                result_scores.append((document, score))
        for query in toremove:
            queries.pop(query)
        if len(queries) == 0:
            break
    else:
        # Continues if the inner loop DIDN'T break!
        continue
    break
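# The for/else above relies on standard Python loop semantics: the else branch
# runs only when the inner loop finishes without hitting break. A minimal,
# self-contained sketch of the same break-out-of-two-loops idiom, with
# illustrative values only:

for outer in range(3):
    for inner in range(3):
        if outer == 1 and inner == 1:
            break           # leaves the inner loop only
    else:
        continue            # inner loop did not break: go to the next outer item
    break                   # inner loop did break: leave the outer loop too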
def Main():
    while True:
        # Display menu options
        DisplayMenu()
        op = raw_input("\nOption > ")
        if not op.isdigit() or int(op) not in [0, 1, 2, 3, 4, 5, 6]:
            print "Invalid option"
            continue
        op = int(op)
        if op == 0:
            # Exit
            break
        else:
            # Read the parameters
            parameter = parameters[op-1]
            name = parameters[op-1][0].split("/")[-1]
            corpus = CorpusReader(
                parameter[0],
                parameter[1],
                parameter[2],
                parameter[3],
                parameter[4],
                category_position=parameter[5],
                category_level=parameter[6],
                start=parameter[7],
                decoding=parameter[8],
            )
            try:
                # Get reviews and shuffle them
                reviews = list(enumerate(corpus.get_opinions()))  # TODO: Replace with a read from the DB
                op = raw_input("\nInsert IDs separated by ',' or <intro> to pick randomly > ")
                if op:
                    # From indexes
                    indexes = [int(i) for i in op.split(',')]
                    indexes = set(indexes)   # Ensure no duplicates
                    indexes = list(indexes)  # Transform back to a list
                    left = len(indexes)
                else:
                    # Randomly
                    while not op.isdigit():
                        op = raw_input("How many? > ")
                    left = int(op)
                    indexes = range(len(reviews))
                    random.shuffle(indexes)
                    indexes = indexes[:left]
                reviews = [(i, review) for (i, review) in reviews if i in indexes]
                result = []

                # Tag every review
                while left != 0:
                    # Start
                    id, review = reviews[left-1]
                    words = review.split(' ')
                    total = len(words)
                    cats = [' ' for _ in range(total)]
                    # For each word annotate with (N) or (I) and give the
                    # possibility of going back by pressing (B)
                    cat = ""
                    idx = 0
                    while True:
                        # Display review
                        DisplayReview(id, idx, total, words, cats)
                        # Check end condition
                        if idx == total:
                            op = raw_input("\nDone. Proceed with the next review (left %i)? [y/n] > " % (left-1))
                            if op == 'y':
                                break
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = ' '
                            continue
                        # Ask for input
                        tooltip  = "\nTag with N(ormal) or I(nverted). "
                        tooltip += "Enter A(bort), B(ack) or <intro> for "
                        tooltip += "repeating last action (%s) > " % (cat.upper() if cat else "None")
                        tag = raw_input(tooltip)
                        if not tag and not cat:
                            # Prevents parsing an empty cat
                            print "Input a category first"; raw_input()
                            continue
                        elif tag:
                            cat = tag
                        # Action from decision
                        cat = cat.lower()
                        if not cat or cat not in 'niba':
                            print "Option", cat, "is not correct."; raw_input()
                            continue
                        if cat == 'b':
                            # Back
                            idx = idx - 1 if idx != 0 else 0
                            cats[idx] = ' '
                        elif cat == 'a':
                            op = raw_input("Are you sure you want to abort (left %i)? [y/n] > " % left)
                            if op.lower() == 'y':
                                raise Exception("Abort")
                        else:
                            # Associate the category
                            cats[idx] = cat
                            idx = idx + 1
                    # Save the result as two lists: words and the respective category for each one
                    result.append({
                        "id": id + 1,
                        "from": name,
                        "annotation": ' '.join(word.lower() + "/" + cat for word, cat in zip(words, cats))
                    })
                    # Update
                    left -= 1
                # View and save results
                if op == 0:
                    continue
                ViewSave(result, name)
            except Exception as e:
                content = json.dumps(result, indent=4, ensure_ascii=False)
                error = "Corpus:%s, Review:%i, Description:%s Partial:%s" % (name, id, str(e), content)
                log(error)
                raw_input("Reason: %s\nEnter to continue..." % str(e))
def train_model1(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data', 'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data', 'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("normal")

    """ set the tokenizer and model parameters """
    #tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #bert_model = BertModel.from_pretrained("bert-base-uncased")
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    #device = torch.device("cuda")
    #bert_model.to(device)

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")
    #classifier = svm.SVC(C=1, gamma='scale', kernel='linear', probability=True)
    #clf = CalibratedClassifierCV(classifier)
    #classifier = svm.SVC(C=1, gamma='scale', kernel='linear', probability=True)
    #classifier = AdaBoostClassifier(learning_rate=0.01, n_estimators=100)
    #clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            #("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        count = 0
        all_texts = list()
        all_gt = list()
        for i in range(0, num_users, batch_size):
            #print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        print(all_gt[0])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    num_users = len(corpus_reader_test.subjects)
    #print(num_users)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        count = 0
        for i in range(0, num_users, batch_size):
            print(i)
            all_texts.append([
                subject.posts[0:j]
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            all_gt.append([
                subject.gt
                for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
            ])
            count += 1
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model1_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)

    return model
("../../corpus/corpus_cine","*.xml","<body>(.*?)</body>","rank=\"(.*?)\"","FILE", "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}), ("../../corpus/corpus_hoteles","*.xml","<coah:review>(.*?)</coah:review>","<coah:rank>(.*?)</coah:rank>","FILE", "BEFORE",None,0,'utf8',{u'1': 0, u'2': 25, u'3': 50, u'4': 75, u'5': 100}), ("../../corpus/corpus_prensa_uy","*.csv","\"(.*?)\",(?:TRUE|FALSE)",",(.*?)\\n","FILE", "AFTER",None,0,'utf8',{u'Neg': 0, u'Neu': 50, u'Pos': 100}), ("../../corpus/corpus_tweets","*.tsv","(.*?)\\t.*?\\n","(.*?\\t.*?)\\t","FILE", "BEFORE",None,1,'utf8',{u'3\t1': 10, u'3\t2': 20, u'2\t4': 90, u'2\t2': 70, u'2\t3': 60, u'4\t2': 30, u'2\t1': 80, u'5\t1': 40, u'1\t5': 50, u'1\t4': 30, u'4\t1': 50, u'1\t1': 40, u'1\t3': 60, u'1\t2': 70}), ("../../corpus/corpus_variado_sfu","*/*.txt","(.*)\s","(.*?)_","PATH", None,1,0,'utf8',{'no': 0, 'yes': 100}) ] # Read each corpus from corpus_reader import CorpusReader for parameter in parameters: reader = CorpusReader( parameter[0], parameter[1], parameter[2], parameter[3], parameter[4], category_position=parameter[5], category_level=parameter[6], start=parameter[7], decoding=parameter[8], ) fun = parameter[9] data = reader.get_data(lambda x:fun[x])
def train_model4(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data', 'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data', 'T1')

    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("yake")

    """ set the tokenizer and model parameters """
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')

    # create the bert
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    """ training the model """
    print("Initializing Training")
    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    count = 0
    all_texts = list()
    all_gt = list()
    for i in range(0, num_users, batch_size):
        all_texts.append([
            subject.posts
            for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_train.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])

    '''
    num_users = len(corpus_reader_test.subjects)
    all_texts = list()
    all_gt = list()
    count = 0
    for i in range(0, num_users, batch_size):
        all_texts.append([
            subject.posts
            for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        all_gt.append([
            subject.gt
            for subject in corpus_reader_test.subjects[(batch_size * count):(batch_size * (count + 1))]
        ])
        count += 1

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])
    '''

    print("End of training")

    # It's important to use binary mode
    dbfile = open(f'model4_{classifier.__class__.__name__}.sav', 'wb')
    pickle.dump(model, dbfile)

    return model
N_COMPONENTS = parameters['N_COMPONENTS']
MODEL_PATH = parameters['MODEL_PATH']
NUM_OF_SAMPLES = parameters['NUM_OF_SAMPLES']
WINDOW_SIZE = parameters['WINDOW_SIZE']
TEST_FOLDER = parameters['TEST_FOLDER']
TEST_FILE = parameters['TEST_FILE']
MODE = parameters['MODE']

if __name__ == '__main__':
    reader = CorpusReader(DATA_PATH, FOLDER_NAME, NUM_OF_SAMPLES=NUM_OF_SAMPLES)
    todo_path = os.path.join('bin', FOLDER_NAME + '_todo.json')
    done_path = os.path.join('bin', FOLDER_NAME + '_done.json')
    if os.path.exists(todo_path) and os.path.exists(done_path):
        with open(todo_path, 'r') as todo_f:
            todo = json.load(todo_f)
            todo_list = todo['document_ids']
        with open(done_path, 'r') as done_f:
            done = json.load(done_f)
            done_list = done['document_ids']
        assert reader.documents_amount == len(todo_list), \
            "Something wrong within the corpus, please delete 'bin' folder and re-run it."
        if len(todo_list) != len(done_list):
            build_elasticsearch(data_path=DATA_PATH, zipfile_name=FOLDER_NAME)