def __init__(self, datasetFile, textDir, checking_folder, lang, client_txt,
             pre_trained_gen, pre_trained_disc, ID, batch_size=1):
    # Load the pre-trained generator and discriminator, wrapped in
    # DataParallel for multi-GPU execution.
    self.generator = torch.nn.DataParallel(
        gan_factory.generator_factory('gan').cuda())
    self.generator.load_state_dict(torch.load(pre_trained_gen))
    self.discriminator = torch.nn.DataParallel(
        gan_factory.discriminator_factory('gan').cuda())
    self.discriminator.load_state_dict(torch.load(pre_trained_disc))
    self.checking_folder = checking_folder
    self.lang = lang
    self.client_txt = client_txt
    self.filename = ID
    self.batch_size = batch_size
    # Build the text vectorizer from the training corpus.
    cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
    self.vectorizer = cl.TrainVocab()
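# Because both networks are wrapped in torch.nn.DataParallel *before*
# load_state_dict() is called, the checkpoint keys must carry the
# "module." prefix. A minimal sketch for loading a checkpoint that was
# saved from an unwrapped model (the key remapping below is an
# assumption, not part of the original code):
#
#   state = torch.load(pre_trained_gen, map_location='cuda')
#   state = {k if k.startswith('module.') else 'module.' + k: v
#            for k, v in state.items()}
#   self.generator.load_state_dict(state)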
def load_corpus(self, name, files, min_len=15, max_len=100, merge=False):
    '''
    Loads one or more text files into a CorpusLoader and stores the
    expanded corpus under self.corpora[name].

    :param name: key for the dictionary entry in self.corpora
    :param files: list of files
    :param min_len, max_len: minimum and maximum sentence length
    :param merge: if True, merge the text elements into one
    :return: None
    '''
    CL = CorpusLoader(files[0], min_len, max_len)
    # Add any remaining files to the same loader.
    for file in files[1:]:
        CL.add_Corpus(file, min_len, max_len)
    if merge:
        CL.mergeData()
    CL.containing.append(name)
    CL.tokenize()
    corpus = self.tax.expandTax(CL)
    self.corpora[name] = corpus
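# Example usage (a sketch; the manager instance is an assumption, the
# file paths are taken from loadData() below):
#
#   mgr.load_corpus('metalogue',
#                   ['data/corpus/Metalogue_extractedLinks_fullCorpus.txt',
#                    'data/corpus/Metalogue_Corpus_NegativePhrases.txt'],
#                   min_len=15, max_len=100, merge=True)
#   corpus = mgr.corpora['metalogue']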
def load_corpus(name, files, min_len=15, max_len=100, merge=True):
    # Module-level variant of the method above; expects exactly two files.
    # The original referenced the builtins `min` and `max` here, which
    # passed functions instead of sentence-length bounds; the bounds are
    # now explicit parameters with the same defaults as the method variant.
    CL = CorpusLoader(files[0], min_len, max_len)
    CL.add_Corpus(files[1], min_len, max_len)
    if merge:
        CL.mergeData()
    corpora[name] = CL
    print(name + " loaded...")
def loadData():
    # Alternative corpora; only the IBM corpus is loaded below.
    file = "data/corpus/Metalogue_extractedLinks_fullCorpus.txt"
    file2 = "data/corpus/Metalogue_Corpus_NegativePhrases.txt"
    file3 = "data/corpus/IBM_extracted_raw.txt"
    CL = CorpusLoader()
    CL.load(file3)
    # CL.add_Corpus(file2)
    # CL.mergeLabel("justification", "evidence", "contingency")
    CL.stats(CL.data)
    print("DONE")
    return CL.data
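# Example usage (a sketch; assumes the IBM corpus file exists at the
# path above):
#
#   data = loadData()   # prints corpus statistics, then "DONE"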
def __init__(self, datasetFile, imagesDir, textDir, split, arrangement, sampling):
    self.datasetFile = datasetFile
    self.imagesDir = imagesDir
    self.textDir = textDir
    self.split = split
    # Wrap the configuration dicts for attribute-style access.
    self.arrangement = easydict.EasyDict(arrangement)
    self.sampling = easydict.EasyDict(sampling)
    self.images_classes = {}
    self.assign_classes()
    # Build the text vectorizer from the training corpus.
    cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
    self.vectorizer = cl.TrainVocab()
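# Example construction (a sketch; the enclosing class name and all
# argument values are assumptions):
#
#   dataset = Text2ImageDataset(
#       datasetFile='data/birds.hdf5',
#       imagesDir='data/images', textDir='data/text',
#       split='train',
#       arrangement={'samples_per_class': 4},
#       sampling={'strategy': 'random'})
#   vec = dataset.vectorizer   # vocabulary trained by CorpusLoader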
def mergeCorpora(self, corpora):
    '''
    Merges the given corpora into one new CorpusLoader object.

    :param corpora: list of self.corpora keys
    :return: the merged CorpusLoader
    '''
    merge = []
    CL = CorpusLoader()
    for corpus in corpora:
        merge.append(self.corpora[corpus])
        CL.containing.append(corpus)
    CL.mergeWithCorpus(merge)
    return CL
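# Example usage (a sketch; assumes both keys were previously filled via
# load_corpus(); the key names are assumptions):
#
#   merged = self.mergeCorpora(['metalogue', 'ibm'])
#   print(merged.containing)   # e.g. ['metalogue', 'ibm']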
def __init__(self):
    self.corpusLoader = CorpusLoader()
    self.corpus = self.corpusLoader.load_corpus()
    self.input_sentences = list(self.corpus.keys())
    logging.debug(pformat(self.corpus))
    self.lemmer = WordNetLemmatizer()
    self.tfIdfVec = TfidfVectorizer(tokenizer=self.tokenize)
    self.similarity_threshold = 0.30
    # Keyword matching for greetings
    self.GREETING_INPUTS = (
        "hello", "hi", "greetings", "sup", "what's up", "hey",
    )
    self.GREETING_RESPONSES = (
        "hi", "hey", "*nods*", "hi there", "hello",
        "I am glad! You are talking to me",
    )
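# A minimal sketch of the greeting lookup these tuples support (the
# method name and the use of random.choice are assumptions):
#
#   def greeting(self, sentence):
#       for word in sentence.lower().split():
#           if word in self.GREETING_INPUTS:
#               return random.choice(self.GREETING_RESPONSES)
#       return None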