def __init__(self, datasetFile, textDir, checking_folder, lang, client_txt,
             pre_trained_gen, pre_trained_disc, ID, batch_size=1):
    # Load the pre-trained generator and discriminator, wrapped in
    # DataParallel and moved to the GPU, from the given checkpoint paths.
    self.generator = torch.nn.DataParallel(
        gan_factory.generator_factory('gan').cuda())
    self.generator.load_state_dict(torch.load(pre_trained_gen))
    self.discriminator = torch.nn.DataParallel(
        gan_factory.discriminator_factory('gan').cuda())
    self.discriminator.load_state_dict(torch.load(pre_trained_disc))

    self.checking_folder = checking_folder
    self.lang = lang
    self.client_txt = client_txt
    self.filename = ID
    self.batch_size = batch_size

    # Build the text vocabulary / vectorizer from the training corpus.
    cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
    self.vectorizer = cl.TrainVocab()
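# Note on the checkpoints loaded above: wrapping a model in
# torch.nn.DataParallel prefixes every parameter key with "module.", so
# pre_trained_gen / pre_trained_disc are expected to have been saved from
# DataParallel-wrapped models as well. A minimal, self-contained
# illustration of that key prefix (plain nn.Linear stand-in, not the
# project's GAN networks):

import torch

_demo = torch.nn.DataParallel(torch.nn.Linear(4, 2))
assert all(key.startswith("module.") for key in _demo.state_dict())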
def load_corpus(name, files, min=15, max=100, merge=True):
    # min/max bound the sentence lengths kept by CorpusLoader; `corpora`
    # is a module-level dict defined elsewhere.
    CL = CorpusLoader(files[0], min, max)
    CL.add_Corpus(files[1], min, max)
    if merge:
        CL.mergeData()
    corpora[name] = CL
    print(name + " loaded...")
def loadData():
    file = "data/corpus/Metalogue_extractedLinks_fullCorpus.txt"
    file2 = "data/corpus/Metalogue_Corpus_NegativePhrases.txt"
    file3 = "data/corpus/IBM_extracted_raw.txt"
    CL = CorpusLoader()
    CL.load(file3)
    # CL.add_Corpus(file2)
    # CL.mergeLabel("justification", "evidence", "contingency")
    CL.stats(CL.data)
    print("DONE")
    return CL.data
def __init__(self, datasetFile, imagesDir, textDir, split, arrangement, sampling):
    self.datasetFile = datasetFile
    self.imagesDir = imagesDir
    self.textDir = textDir
    self.split = split
    self.arrangement = easydict.EasyDict(arrangement)
    self.sampling = easydict.EasyDict(sampling)
    self.images_classes = {}
    self.assign_classes()
    cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
    self.vectorizer = cl.TrainVocab()
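# easydict.EasyDict (used above for `arrangement` and `sampling`) gives
# attribute-style access to a plain dict. Illustrative keys only; the real
# configuration fields for this dataset class are not shown in this snippet:

import easydict

_cfg = easydict.EasyDict({"img_size": 64, "channels": 3})
assert _cfg.img_size == 64 and _cfg["channels"] == 3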
def load_corpus(self, name, files, min=15, max=100, merge=False):
    '''
    :param name: key for the dictionary entry in self.corpora
    :param files: list of corpus files; every file after the first is
                  appended via add_Corpus
    :param min, max: min and max length of sentences
    :param merge: if True, merge the text elements into one
    :return: None
    '''
    CL = CorpusLoader(files[0], min, max)
    for file in files[1:]:
        CL.add_Corpus(file, min, max)
    if merge:
        CL.mergeData()
    CL.containing.append(name)
    CL.tokenize()
    # Expand the loaded corpus along the taxonomy before storing it.
    corpus = self.tax.expandTax(CL)
    self.corpora[name] = corpus
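# Hypothetical call to load_corpus() above (the owning object is assumed;
# the file paths are the Metalogue files from loadData() earlier):
#
#   self.load_corpus(
#       "metalogue",
#       ["data/corpus/Metalogue_extractedLinks_fullCorpus.txt",
#        "data/corpus/Metalogue_Corpus_NegativePhrases.txt"],
#       min=15, max=100, merge=True)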
def mergeCorpora(self, corpora):
    '''
    merges the corpora into one new CL object
    :param corpora: list of self.corpora keys
    :return: CL
    '''
    merge = []
    CL = CorpusLoader()
    for corpus in corpora:
        merge.append(self.corpora[corpus])
        CL.containing.append(corpus)
    CL.mergeWithCorpus(merge)
    return CL
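# Sketch of a mergeCorpora() call (the corpus keys are illustrative and
# must already exist in self.corpora):
#
#   merged = self.mergeCorpora(["metalogue", "ibm"])
#   # merged.containing -> ["metalogue", "ibm"]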
import logging
import random
import string
from pprint import pformat

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# CorpusLoader is project-local; its import path is not shown in this snippet.


class Robo:

    def __init__(self):
        self.corpusLoader = CorpusLoader()
        self.corpus = self.corpusLoader.load_corpus()
        self.input_sentences = list(self.corpus.keys())
        logging.debug(pformat(self.corpus))
        self.lemmer = WordNetLemmatizer()
        self.tfIdfVec = TfidfVectorizer(tokenizer=self.tokenize)
        self.similarity_threshold = 0.30

        # Keyword Matching
        self.GREETING_INPUTS = (
            "hello",
            "hi",
            "greetings",
            "sup",
            "what's up",
            "hey",
        )
        self.GREETING_RESPONSES = ("hi", "hey", "*nods*", "hi there", "hello",
                                   "I am glad! You are talking to me")

    def lemmatize(self, tokens):
        """
        Lemmatizes a list of words / tokens.

        Takes as input the list of words and, after lemmatizing each one,
        returns a new list with the result.

        Args:
            tokens (:obj:`list` of :obj:`str`): List of words to be lemmatized

        Returns:
            (:obj:`list` of :obj:`str`): A list of lemmatized words
        """
        return [self.lemmer.lemmatize(token) for token in tokens]

    # Tokenize, convert to lowercase, remove punctuation and then lemmatize
    def tokenize(self, text):
        """
        Splits (tokenizes) a text into discrete words.

        Apart from the tokenization it applies some pre- and post-processing.
        Pre-processing: convert the text to lowercase, remove any punctuation.
        Post-processing: lemmatize each token.

        Args:
            text (str): the input text

        Returns:
            (:obj:`list` of :obj:`str`): a "bag of words"
        """
        return self.lemmatize(
            nltk.word_tokenize(text.lower().translate(
                str.maketrans('', '', string.punctuation))))

    def isGreeting(self, sentence):
        """
        Checks if the provided sentence is considered a greeting or not.

        Args:
            sentence (str): A user-provided sentence that might be a greeting

        Returns:
            bool: True if the sentence is a greeting and False if not.
        """
        for word in sentence.split():
            if word.lower() in self.GREETING_INPUTS:
                return True
        return False

    def greet(self):
        """ Returns one of the GREETING_RESPONSES at random """
        return random.choice(self.GREETING_RESPONSES)

    def help(self):
        return """I like telling jokes, gossip and chat in general.
I'm pretty knowledgeable about the following topics:
* AI
* Bots
* Computers
* Food
* History
* Literature
* Money
* Movies
* Politics
* Psychology
* Science
* Sports
* Trivia
"""

    def get_response(self, user_input):
        """
        Takes user input and tries to retrieve an appropriate response.

        Args:
            user_input (str): The user input :)

        Returns:
            str: The response to give to the user
        """
        # Vectorize the known sentences together with the user input so the
        # query shares the same TF-IDF vocabulary.
        tfidf = self.tfIdfVec.fit_transform(self.input_sentences + [user_input])
        # get_feature_names() was renamed to get_feature_names_out() in
        # scikit-learn >= 1.0.
        logging.info(self.tfIdfVec.get_feature_names())
        logging.info(tfidf.shape)
        # Cosine similarity between the user input (last row) and the corpus.
        vals = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
        highest_similarity_idx = vals.argsort()[-1]
        highest_similarity = vals[highest_similarity_idx]
        if highest_similarity <= self.similarity_threshold:
            return "I am sorry! I don't understand you"
        else:
            reply_key = self.input_sentences[highest_similarity_idx]
            logging.debug(self.corpus[reply_key])
            if len(self.corpus[reply_key]) > 1:
                return random.choice(self.corpus[reply_key])
            else:
                return self.corpus[reply_key][0]
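# Minimal interactive driver for the Robo class above -- a sketch, assuming
# the NLTK data it needs ('punkt' and 'wordnet') is already downloaded and
# that CorpusLoader().load_corpus() returns a dict mapping prompt sentences
# to lists of candidate responses, as the class itself expects:

if __name__ == "__main__":
    bot = Robo()
    print("ROBO: Hi! Ask me something (type 'bye' to exit).")
    while True:
        user_input = input("> ").strip()
        if user_input.lower() == "bye":
            print("ROBO: Bye! Take care.")
            break
        if bot.isGreeting(user_input):
            print("ROBO: " + bot.greet())
        else:
            print("ROBO: " + bot.get_response(user_input))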