def __iter__(self):
        """Yield the tokenized form of every sufficiently long sentence.

        When ``self.files`` is not ``None`` each path in it is opened and read
        line by line; otherwise the entries of ``self.text_list`` are used as
        the input texts. Every text is split with ``text2sentences`` and each
        resulting sentence is tokenized with ``sentence2words``. Results with
        fewer than three items are skipped.

        :return: generator over the ``sentence2words`` results.
        """
        if self.files is not None:
            # Iterating over a list of file paths
            for input_file in self.files:
                # open() replaces the Python-2-only file() builtin, and the
                # with-block guarantees the handle is closed even when the
                # consumer stops iterating early.
                # NOTE(review): binary mode is kept from the original, so on
                # Python 3 each line is a bytes object -- confirm that
                # text2sentences accepts bytes.
                with open(input_file, "rb") as handle:
                    for text in handle:
                        for line in text2sentences(
                                text,
                                remove_non_english_chars=self.remove_non_english_chars):
                            words = sentence2words(
                                line, remove_stop_words=self.remove_stop_words)
                            if len(words) < 3:
                                continue
                            yield words

        else:
            # Iterating over a list of text
            for text in self.text_list:
                for line in text2sentences(
                        text,
                        remove_non_english_chars=self.remove_non_english_chars):
                    words = sentence2words(
                        line, remove_stop_words=self.remove_stop_words)
                    if len(words) < 3:
                        continue
                    yield words
Exemple #2
0
 def text2vectors(self, text):
     """
     Lazily produce the vector of every word of the input text that is known to the model.

     The text is tokenized with ``sentence2words``; tokens for which
     ``token in self.model`` is false are dropped, and ``self.model[token]``
     is yielded for each remaining token in order.
     :param text: input text
     :return: iterator of vectors looked up in the Word2Vec model.
     """
     known_tokens = []
     for token in sentence2words(text):
         # Filter up front so the whole vocabulary check happens before the first yield.
         if token in self.model:
             known_tokens.append(token)
     for token in known_tokens:
         yield self.model[token]
 def text2vectors(self,text):
     """
     Generate model vectors for the words of ``text``.

     Words come from ``sentence2words(text)``; only those contained in
     ``self.model`` are kept, and the generator yields ``self.model[word]``
     for each kept word, preserving their order.
     :param text: input text
     :return: iterator of vectors created from the words in the text using the Word2Vec model.
     """
     tokens = sentence2words(text)
     in_vocabulary = [token for token in tokens if token in self.model]
     if not in_vocabulary:
         # Nothing recognised by the model: the generator is simply empty.
         return
     for token in in_vocabulary:
         yield self.model[token]
Exemple #4
0
    def __iter__(self):
        """Yield the tokenized form of every sufficiently long sentence.

        When ``self.files`` is not ``None`` each path in it is opened and read
        line by line; otherwise the entries of ``self.text_list`` are used as
        the input texts. Every text is split with ``text2sentences`` and each
        resulting sentence is tokenized with ``sentence2words``. Results with
        fewer than three items are skipped.

        :return: generator over the ``sentence2words`` results.
        """
        if self.files is not None:
            # Iterating over a list of file paths
            for input_file in self.files:
                # open() replaces the Python-2-only file() builtin, and the
                # with-block guarantees the handle is closed even when the
                # consumer stops iterating early.
                # NOTE(review): binary mode is kept from the original, so on
                # Python 3 each line is a bytes object -- confirm that
                # text2sentences accepts bytes.
                with open(input_file, "rb") as handle:
                    for text in handle:
                        for line in text2sentences(
                                text,
                                remove_non_english_chars=self.
                                remove_non_english_chars):
                            words = sentence2words(
                                line, remove_stop_words=self.remove_stop_words)
                            if len(words) < 3:
                                continue
                            yield words

        else:
            # Iterating over a list of text
            for text in self.text_list:
                for line in text2sentences(text,
                                           remove_non_english_chars=self.
                                           remove_non_english_chars):
                    words = sentence2words(
                        line, remove_stop_words=self.remove_stop_words)
                    if len(words) < 3:
                        continue
                    yield words
Exemple #5
0
    def text2vectors(self, text):
        '''Convert input text into an iterator that returns the corresponding
        vector representation of each word in the text, if it exists in the
        Word2Vec model

        Parameters
        ==========
        text: input text

        Returns
        =======
        Iterator yielding ``self.model.wv[word]`` for every word produced by
        ``sentence2words(text)`` that passes the ``word in self.model`` check.
        '''
        words = sentence2words(text)
        # NOTE(review): membership is tested against self.model while the
        # vector is read from self.model.wv -- confirm both refer to the same
        # vocabulary for the model type in use.
        words = [w for w in words if w in self.model]
        for w in words:
            # Plain subscription instead of calling __getitem__ by hand.
            yield self.model.wv[w]
Exemple #6
0
    def __iter__(self):
        """Yield the tokenized form of every sentence in the configured input.

        When ``self.files`` is not ``None`` each path in it is opened in text
        mode and read line by line; otherwise the entries of
        ``self.text_list`` are used as the input texts. Every text is split
        with ``text2sentences`` and each resulting sentence is tokenized with
        ``sentence2words``; every result is yielded, whatever its length.

        :return: generator over the ``sentence2words`` results.
        """
        if self.files is not None:

            # Iterating over a list of file paths
            for input_file in self.files:
                # The with-block closes the handle deterministically, and
                # iterating the file object streams lines instead of loading
                # the whole file with readlines().
                with open(input_file, "r") as handle:
                    for text in handle:
                        for line in text2sentences(
                                text,
                                remove_non_english_chars=self.
                                remove_non_english_chars):
                            words = sentence2words(
                                line, remove_stop_words=self.remove_stop_words)
                            yield words

        else:

            # Iterating over a list of text
            for text in self.text_list:
                for line in text2sentences(text,
                                           remove_non_english_chars=self.
                                           remove_non_english_chars):
                    words = sentence2words(
                        line, remove_stop_words=self.remove_stop_words)
                    yield words