def __init__(self, path, tokenizer, search, write):
        # build the index, then report timing, size, and vocabulary metrics
        start = time.time()

        cr = CorpusReader(path, tokenizer)
        cr.processFile()

        # performance metrics
        print("Index time: {:.2f} seconds".format(time.time() - start))
        size = os.stat(path + '.bin').st_size
        print("Index size on disk:", sizeof_fmt(size))
        words = cr.index.keys()
        metrics(cr.indexer)

        print(
            f"Vocabulary: {len(words)} words, size: {sizeof_fmt(len(''.join(words)))}"
        )
        if search != '':
            print(cr.indexer.search(search))
        if write:
            cr.indexer.writeIndexToFile(f"{path}_indexer.txt")
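
# The snippet assumes a sizeof_fmt helper for human-readable byte counts; it is
# not shown above, so this is a minimal stand-in, not the project's own version:
def sizeof_fmt(num, suffix="B"):
    # walk up the binary prefixes until the value drops below 1024
    for unit in ("", "Ki", "Mi", "Gi", "Ti"):
        if abs(num) < 1024.0:
            return f"{num:.1f} {unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f} Pi{suffix}"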
Example no. 2
from CorpusReader import CorpusReader
from utils import *
from stanford import CoreNLP
from dependency_tree import build_tree
from dependency_similarity import *
from feature import *
from NeuralLearner import *
import torch
import torch.nn as nn
import numpy as np
import sys
import xgboost as xgb

# data pre-processing (left disabled with `if False:`; switch to True to rerun it)
if False:
    reader = CorpusReader('data/test-set.txt')

    train_data = reader.data()

    for _, item in train_data.items():
        item['token1'] = tokenized_sentence(item['Sentence1'])
        item['token2'] = tokenized_sentence(item['Sentence2'])

    corenlp = CoreNLP(sys.argv)
    corenlp.start_server()
    for k, item in train_data.items():
        print(k)
        item['d-tree1'] = corenlp.dependency_parse_tree(
            list_to_string(item['token1']))
        item['d-tree2'] = corenlp.dependency_parse_tree(
            list_to_string(item['token2']))
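
# The loop above leans on two utils helpers that are not shown; hedged
# stand-ins for illustration (the real implementations may differ, e.g. use NLTK):
def tokenized_sentence(sentence):
    # naive whitespace tokenization
    return sentence.split()

def list_to_string(tokens):
    # rejoin tokens so CoreNLP receives a plain sentence string
    return " ".join(tokens)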
Example no. 3
            "WARNING: You have a CUDA device, so you should probably run with --gpu"
        )
else:
    if args.gpu:
        print(
            "You do not have a GPU device, so you should run on CPU without the --gpu option."
        )
        exit()

if 'word' not in args.features_level and 'character' not in args.features_level:
    exit(
        "The features_level argument is empty; it must include at least one of [word, character]."
    )

torch.manual_seed(args.seed)
corpus_train_reader = CorpusReader(args.corpus_train_file, 1000000)  # 1 MB buffer
corpus_dev_reader = CorpusReader(args.corpus_dev_file, 1000000)  # 1 MB buffer

dictionary_word = DictionaryWord()
dictionary_char = None

if 'character' in args.features_level:
    dictionary_char = DictionaryCharacter()

model = None
# Load the pre-trained Model for fine-tuning
if path.exists(args.output_model_path):
    print("Loading pre-trained Model...")
    model = LanguageModel.load_model(
        use_gpu=args.gpu, path_to_pretrained_model=args.output_model_path)
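
# The second CorpusReader argument looks like a read-buffer size in bytes; a
# hypothetical sketch of the chunked reading such a reader might do internally:
def read_in_chunks(file_path, buffer_size=1000000):
    with open(file_path, encoding="utf-8") as f:
        while True:
            chunk = f.read(buffer_size)
            if not chunk:
                break
            yield chunk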
Example no. 4
import pandas as pd
from CorpusReader import CorpusReader

import os

file_dir = os.path.dirname(os.path.abspath(__file__))
train_file_path = os.path.join(file_dir, 'train_df.csv')
test_file_path = os.path.join(file_dir, 'test_df.csv')

train_cr = CorpusReader('./dataset/semeval_train.txt')
train_features = train_cr.feature_extract()

test_cr = CorpusReader('./dataset/semeval_test.txt')
test_features = test_cr.feature_extract()

train_df = pd.DataFrame([t.__dict__ for t in train_features])
train_df.to_csv(train_file_path, index=False, header=True)

test_df = pd.DataFrame([t.__dict__ for t in test_features])
test_df.to_csv(test_file_path, index=False, header=True)
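
# The DataFrame construction works because feature_extract() returns plain
# objects whose attributes become columns via __dict__. A hypothetical feature
# class (field names invented for illustration) showing the pattern:
class PairFeature:
    def __init__(self, word_overlap, length_diff):
        self.word_overlap = word_overlap  # becomes the 'word_overlap' column
        self.length_diff = length_diff    # becomes the 'length_diff' column

# pd.DataFrame([t.__dict__ for t in features]) then yields one row per object.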

Example no. 5
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header

header = Header()
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair',
        'eadar', 'adar')

books = False
sentences = True

print "Loading Corpora..."
if books:
    print "\tloading munster"
    M = CorpusReader('munster')
    print "\tloading connacht"
    C = CorpusReader('connacht', M.countBooks())
    C.truncateBooks(M.countBooks())
    print "\tloading ulster"
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())
    l = [U, M, C]
#print "Done."
if sentences:
    print "Creating Balanced Set of sentences"
    M = CorpusReader('munster')
    C = CorpusReader('connacht')
    U = CorpusReader('ulster')
    l = [U, M, C]
    MIN_LENG = min([x.countSentences() for x in l])
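
    # MIN_LENG is the size of the smallest dialect corpus; a hedged sketch of
    # the balancing step that presumably follows (truncateSentences is assumed
    # here by analogy with truncateBooks above):
    for corpus in l:
        corpus.truncateSentences(MIN_LENG)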
Example no. 6
def LoadCorpus(self):
    self.cr = CorpusReader(self.readCorpusFilePath)
    # avoid shadowing the built-in id()
    corpus_id, corpus = self.cr.getCorpu(1)
    self.UpdataCorpu(corpus_id, corpus)
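
# LoadCorpus depends on a CorpusReader with a getCorpu(n) accessor; a
# hypothetical stub of that interface (line-per-entry format assumed):
class CorpusReader:
    def __init__(self, file_path):
        with open(file_path, encoding="utf-8") as f:
            self.entries = f.read().splitlines()

    def getCorpu(self, n):
        # 1-based lookup returning (id, text) for the n-th corpus entry
        return n, self.entries[n - 1]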
Example no. 7
    print('')
    print('-printing each lemma-')
    nlpPipeLine.createLemma(nlp, sentTest)

    print('')
    print('-printing each POS tag-')
    nlpPipeLine.createPOS(nlp, sentTest)

    print('')
    print('-printing the dependency parse tree-')
    nlpPipeLine.createDepParse(nlp, sentTest)
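
    # Hedged sketch of what the nlpPipeLine helpers above might do, assuming
    # nlp is a spaCy pipeline (the real methods are not shown in this snippet):
    def sketch_pos(nlp, sent):
        for token in nlp(sent):
            print(token.text, token.pos_)

    def sketch_dep_parse(nlp, sent):
        for token in nlp(sent):
            print(token.text, token.dep_, token.head.text)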


    data_folder_train = Path("data/train-set.txt")
    trainCorpusObject = CorpusReader(data_folder_train)

    data_folder_test = Path("data/dev-set.txt")
    devCorpusObject = CorpusReader(data_folder_test)


    mlObject = MachineLearningTasks(trainCorpusObject, devCorpusObject)

    # run the NLP pipeline for each paragraph in the corpus object
    # and store the results in the appropriate HashMap dict

    """a = 0
    for corpusParah in trainCorpusObject.corpus:
        #doc1 = nlp(corpusParah.hm1["sent"])
        #doc2 = nlp(corpusParah.hm2["sent"])
        if(a==2):