def __init__(self, path, tokenizer, search, write):
    """Index the corpus at *path*, print timing/size metrics, and then
    optionally run a search query and/or dump the index to a text file.

    path      -- corpus file to index (its index lives at path + '.bin')
    tokenizer -- tokenizer handed through to CorpusReader
    search    -- query string; searched only when non-empty
    write     -- when truthy, write the index to '<path>_indexer.txt'
    """
    t0 = time.time()
    reader = CorpusReader(path, tokenizer)
    reader.processFile()

    # Performance metrics
    elapsed = time.time() - t0
    print(f"Index time: {elapsed:.2f} seconds")
    disk_bytes = os.stat(path + '.bin').st_size
    print("Index Size on disk :", sizeof_fmt(disk_bytes))

    vocab = reader.index.keys()
    metrics(reader.indexer)
    # Vocabulary "size" here is the total character count of all terms.
    print(
        f"Vocabulary: {len(vocab)} words, size: {sizeof_fmt(len(''.join(vocab)))}"
    )

    if search != '':
        print(reader.indexer.search(search))
    if write:
        reader.indexer.writeIndexToFile(f"{path}_indexer.txt")
from CorpusReader import CorpusReader
from utils import *
from stanford import CoreNLP
from dependency_tree import build_tree
from dependency_similarity import *
from feature import *
from NeuralLearner import *
import torch
import torch.nn as nn
import numpy as np
import sys
import xgboost as xgb

# One-off data pre-processing pass: tokenize both sentences of every pair
# and attach their CoreNLP dependency parse trees. Disabled by default —
# this replaces the original bare `if False:` dead-code guard with a named
# flag so the toggle is discoverable (behavior unchanged while False).
RUN_PREPROCESSING = False

if RUN_PREPROCESSING:
    reader = CorpusReader('data/test-set.txt')
    train_data = reader.data()

    # Tokenize both sentences of each pair.
    for _, item in train_data.items():
        item['token1'] = tokenized_sentence(item['Sentence1'])
        item['token2'] = tokenized_sentence(item['Sentence2'])

    # Parse each tokenized sentence with a local CoreNLP server.
    corenlp = CoreNLP(sys.argv)
    corenlp.start_server()
    for k, item in train_data.items():
        print(k)
        item['d-tree1'] = corenlp.dependency_parse_tree(
            list_to_string(item['token1']))
        item['d-tree2'] = corenlp.dependency_parse_tree(
            list_to_string(item['token2']))
"WARNING: You have a CUDA device, so you should probably run with --gpu" ) else: if args.gpu: print( "You do not have a GPU device, so you should run CPU without --gpu option." ) exit() if 'word' not in args.features_level and 'character' not in args.features_level: exit( "features_level argument is empty. It should include at least one of [word,character] items." ) torch.manual_seed(args.seed) corpus_train_reader = CorpusReader(args.corpus_train_file, 1000000) # 100MB corpus_dev_reader = CorpusReader(args.corpus_dev_file, 1000000) # 100MB dictionary_word = DictionaryWord() dictionary_char = None if 'character' in args.features_level: dictionary_char = DictionaryCharacter() model = None # Load the pre-trained Model for fine-tuning if path.exists(args.output_model_path): print("Loading pre-trained Model...") model = LanguageModel.load_model( use_gpu=args.gpu, path_to_pretrained_model=args.output_model_path)
import pandas as pd
from CorpusReader import CorpusReader
import os

# Output CSVs are written next to this script, independent of the CWD.
file_dir = os.path.dirname(os.path.abspath(__file__))
train_file_path = os.path.join(file_dir, 'train_df.csv')
test_file_path = os.path.join(file_dir, 'test_df.csv')


def _features_frame(corpus_path):
    """Read one corpus file and return its extracted features as a DataFrame.

    Each extracted feature object is flattened via its ``__dict__`` so every
    attribute becomes one DataFrame column. Factored out because the original
    duplicated this pipeline verbatim for the train and test corpora.
    """
    features = CorpusReader(corpus_path).feature_extract()
    return pd.DataFrame([f.__dict__ for f in features])


# NOTE(review): corpus inputs are CWD-relative while outputs are
# script-relative — confirm this asymmetry is intentional.
train_df = _features_frame('./dataset/semeval_train.txt')
train_df.to_csv(train_file_path, index=False, header=True)

test_df = _features_frame('./dataset/semeval_test.txt')
test_df.to_csv(test_file_path, index=False, header=True)
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header

header = Header()

# Verb suffixes of interest (presumably Irish past-tense endings, given the
# Munster/Connacht/Ulster dialect corpora below — TODO confirm).
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair',
        'ead ar', 'adar')

# Granularity toggles: compare corpora at book level or sentence level.
books = False
sentences = True

# NOTE(review): CorpusReader is not imported anywhere in this chunk —
# presumably provided by Header or an earlier import; verify before running.
print "Loading Corpora..."
if books:
    print "\tloading munster"
    M = CorpusReader('munster')
    print "\tloading connacht"
    C = CorpusReader('connacht', M.countBooks())
    # Truncate the larger corpora to Munster's book count for balance.
    C.truncateBooks(M.countBooks())
    print "\tloading ulster"
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())
    l = [U, M, C]
    #print "Done."
if sentences:
    print "Creating Balanced Set of sentences"
    M = CorpusReader('munster')
    C = CorpusReader('connacht')
    U = CorpusReader('ulster')
    l = [U, M, C]
    # Balance the three dialect corpora to the smallest sentence count.
    MIN_LENG = min([x.countSentences() for x in l])
def LoadCorpus(self):
    """Open the corpus file configured on this object and display its
    first entry.

    Side effects: caches the reader on ``self.cr`` and forwards entry #1
    to ``self.UpdataCorpu`` (project API name kept as-is).
    """
    self.cr = CorpusReader(self.readCorpusFilePath)
    # Renamed from `id, corpu`: the original shadowed the builtin `id`.
    corpus_id, corpus_text = self.cr.getCorpu(1)
    self.UpdataCorpu(corpus_id, corpus_text)
# Demonstrate each stage of the NLP pipeline on a sample sentence.
print('')
print('-printing each lemma-')
nlpPipeLine.createLemma(nlp, sentTest)
print('')
print('-printing each POS tag-')
nlpPipeLine.createPOS(nlp, sentTest)
print('')
print('-printing all Dependency parse tree-')
nlpPipeLine.createDepParse(nlp, sentTest)

# Load the training and evaluation corpora.
# NOTE(review): despite the variable name, data_folder_test points at the
# dev set, not a held-out test set.
data_folder_train = Path("data/train-set.txt")
trainCorpusObject = CorpusReader(data_folder_train)
data_folder_test = Path("data/dev-set.txt")
devCorpusObject = CorpusReader(data_folder_test)
mlObject = MachineLearningTasks(trainCorpusObject, devCorpusObject)

#do the nlp pipeline for each parah in corpusObject
#store in the appropriate HashMap dict
"""a = 0 for corpusParah in trainCorpusObject.corpus: #doc1 = nlp(corpusParah.hm1["sent"]) #doc2 = nlp(corpusParah.hm2["sent"]) if(a==2):