Example #1
    def __init__(self,
                 datasetFile,
                 textDir,
                 checking_folder,
                 lang,
                 client_txt,
                 pre_trained_gen,
                 pre_trained_disc,
                 ID,
                 batch_size=1):

        # Build the generator, wrap it in DataParallel, and load the
        # pre-trained weights.
        self.generator = torch.nn.DataParallel(
            gan_factory.generator_factory('gan').cuda())
        self.generator.load_state_dict(torch.load(pre_trained_gen))

        # Likewise for the discriminator.
        self.discriminator = torch.nn.DataParallel(
            gan_factory.discriminator_factory('gan').cuda())
        self.discriminator.load_state_dict(torch.load(pre_trained_disc))

        self.checking_folder = checking_folder
        self.lang = lang
        self.client_txt = client_txt
        self.filename = ID
        self.batch_size = batch_size

        # Build the text vectorizer from the training vocabulary.
        cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
        self.vectorizer = cl.TrainVocab()
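
A minimal construction sketch; the enclosing class name (ImageFromCaption here) is hypothetical, since the snippet shows only the constructor, and the paths and ID are illustrative:

# Hypothetical instantiation; class name, paths, and ID are assumptions.
runner = ImageFromCaption(
    datasetFile='data/birds.hdf5',
    textDir='data/texts',
    checking_folder='checks/',
    lang='en',
    client_txt='a small bird with a red head and grey wings',
    pre_trained_gen='checkpoints/gen_190.pth',
    pre_trained_disc='checkpoints/disc_190.pth',
    ID='sample_0')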
Example #2
def load_corpus(name, files, min=15, max=100, merge=True):

    # min/max bound the sentence length, as in the method variant in Example #5.
    CL = CorpusLoader(files[0], min, max)
    CL.add_Corpus(files[1], min, max)

    if merge:
        CL.mergeData()

    # `corpora` is assumed to be a module-level dict.
    corpora[name] = CL

    print(name + " loaded...")
Example #3
def loadData():
    file = "data/corpus/Metalogue_extractedLinks_fullCorpus.txt"
    file2 = "data/corpus/Metalogue_Corpus_NegativePhrases.txt"
    file3 = "data/corpus/IBM_extracted_raw.txt"

    CL = CorpusLoader()
    # Only the IBM corpus is loaded here; the Metalogue files above feed the
    # commented-out variants below.
    CL.load(file3)

    #CL.add_Corpus(file2)
    #CL.mergeLabel("justification", "evidence", "contingency")
    CL.stats(CL.data)
    print("DONE")

    return CL.data
Example #4
    def __init__(self, datasetFile, imagesDir, textDir, split, arrangement,
                 sampling):
        self.datasetFile = datasetFile
        self.imagesDir = imagesDir
        self.textDir = textDir
        self.split = split
        # Wrap the config dicts for attribute-style access.
        self.arrangement = easydict.EasyDict(arrangement)
        self.sampling = easydict.EasyDict(sampling)

        self.images_classes = {}
        self.assign_classes()

        cl = CorpusLoader(datasetFile=datasetFile, textDir=textDir)
        self.vectorizer = cl.TrainVocab()
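
A hedged instantiation sketch; the class name (TextImageDataset) and the arrangement/sampling keys are assumptions, as the snippet again shows only the constructor:

# Hypothetical usage; the class name and dict keys are assumptions.
dataset = TextImageDataset(
    datasetFile='data/birds.hdf5',
    imagesDir='data/images',
    textDir='data/texts',
    split='train',
    arrangement={'mode': 'pairs'},
    sampling={'strategy': 'random'})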
Example #5
    def load_corpus(self, name, files, min=15, max=100, merge=False):
        '''
        :param name: key for the dictionary entry in self.corpora
        :param files: list of corpus files; the first is loaded, the rest are appended
        :param min, max: minimum and maximum sentence length
        :param merge: if True, merge the two text elements into one
        :return: None
        '''

        CL = CorpusLoader(files[0], min, max)

        # Append any remaining files to the corpus.
        for file in files[1:]:
            CL.add_Corpus(file, min, max)

        if merge:
            CL.mergeData()

        CL.containing.append(name)
        CL.tokenize()

        corpus = self.tax.expandTax(CL)

        self.corpora[name] = corpus
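
A hedged call-site sketch; `manager` stands for an instance of the enclosing class (whose name the snippet does not show), and the file paths are taken from the loadData example above:

# Hypothetical call; `manager` is an instance of the enclosing class.
manager.load_corpus(
    'metalogue',
    ['data/corpus/Metalogue_extractedLinks_fullCorpus.txt',
     'data/corpus/Metalogue_Corpus_NegativePhrases.txt'],
    min=15, max=100, merge=True)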
Example #6
    def mergeCorpora(self, corpora):
        '''
        Merges the given corpora into one new CorpusLoader (CL) object.
        :param corpora: list of self.corpora keys
        :return: the merged CorpusLoader
        '''

        merge = []
        CL = CorpusLoader()

        for corpus in corpora:
            merge.append(self.corpora[corpus])
            CL.containing.append(corpus)

        CL.mergeWithCorpus(merge)

        return CL
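
A short usage sketch with the same hypothetical `manager` instance as above; the corpus keys are illustrative:

# Merge two previously loaded corpora into a single CorpusLoader.
merged = manager.mergeCorpora(['metalogue', 'ibm'])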
Example #7
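The class depends on the imports below; the CorpusLoader import path is project-specific and assumed:

import logging
import random
import string
from pprint import pformat

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from corpus_loader import CorpusLoader  # assumed project-local module
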
class Robo:
    def __init__(self):
        self.corpusLoader = CorpusLoader()
        self.corpus = self.corpusLoader.load_corpus()
        self.input_sentences = list(self.corpus.keys())
        logging.debug(pformat(self.corpus))

        self.lemmer = WordNetLemmatizer()
        self.tfIdfVec = TfidfVectorizer(tokenizer=self.tokenize)
        self.similarity_threshold = 0.30

        # Keyword Matching
        self.GREETING_INPUTS = (
            "hello",
            "hi",
            "greetings",
            "sup",
            "what's up",
            "hey",
        )
        self.GREETING_RESPONSES = ("hi", "hey", "*nods*", "hi there", "hello",
                                   "I am glad! You are talking to me")

    def lemmatize(self, tokens):
        """
        Lemmatizes a list of words/tokens and returns a new list with the lemmatized form of each one.

        Args:
            tokens(:obj:`list` of :obj:`str`): List of words to be lemmatized

        Returns:
            (:obj:`list` of :obj:`str`): A list of lemmatized words
        """
        return [self.lemmer.lemmatize(token) for token in tokens]

    # Tokenize, convert to lowercase, remove punctuation and then lemmatize
    def tokenize(self, text):
        """
        Splits (tokenizes) a text into discrete words. Apart from the tokenization it applies some pre- and post-processing.

        Pre-processing: convert text to lowercase, remove any punctuation
        Post-processing: lemmatize each token.

        Args:
            text(str): the input text
        
        Returns:
            (:obj:`list` of :obj:`str`): a "bag of words"
        """
        return self.lemmatize(
            nltk.word_tokenize(text.lower().translate(
                str.maketrans('', '', string.punctuation))))

    def isGreeting(self, sentence):
        """
        Checks if the provided sentence is considered a greeting or not.

        Args:
            sentence(str): A user provided sentence that might be a greeting or not

        Returns:
            bool: True if the sentence is a greeting and False if not.
        """
        for word in sentence.split():
            if word.lower() in self.GREETING_INPUTS:
                return True
        return False

    def greet(self):
        """
        Returns one of the GREETING_RESPONSES at random
        """
        return random.choice(self.GREETING_RESPONSES)

    def help(self):
        return """I like telling jokes, gossip and chat in general. I'm pretty knowledgeable about the following topics:

        * AI
        * Bots
        * Computers
        * Food
        * History
        * Literature
        * Money
        * Movies
        * Politics
        * Psychology
        * Science
        * Sports
        * Trivia
        """

    def get_response(self, user_input):
        """
        Takes user input and tries to retrieve an appropriate response.
            
        Args:
            user_input (str): The user input :)

        Returns:
            str: The response to give to the user
        """
        tfidf = self.tfIdfVec.fit_transform(self.input_sentences +
                                            [user_input])

        # get_feature_names() was removed in scikit-learn 1.2; use the
        # _out variant on current versions.
        logging.info(self.tfIdfVec.get_feature_names_out())
        logging.info(tfidf.shape)

        # The last tf-idf row is the user input; compare it to every corpus sentence.
        vals = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
        highest_similarity_idx = vals.argsort()[-1]
        highest_similarity = vals[highest_similarity_idx]

        if highest_similarity <= self.similarity_threshold:
            return "I am sorry! I don't understand you"
        else:
            reply_key = self.input_sentences[highest_similarity_idx]
            logging.debug(self.corpus[reply_key])
            if len(self.corpus[reply_key]) > 1:
                return random.choice(self.corpus[reply_key])
            else:
                return self.corpus[reply_key][0]
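
A minimal interaction sketch, assuming CorpusLoader().load_corpus() returns the expected sentence-to-replies dictionary and the NLTK punkt and wordnet data are installed:

# Hypothetical chat session exercising the class above.
bot = Robo()
print(bot.greet())
print(bot.get_response("tell me about computers"))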