Code example #1
0
def cleanText(input_list):
    """Clean a list of raw text strings and remove stopwords.

    Per entry (entries whose string form is "nan" are skipped):
      1. Normalise " /" -> "/" and drop words of 1-3 characters.
      2. Apply the TextCleaner regexes (punctuation and digits removed,
         hashtags kept).
      3. Drop stopwords loaded from the configured stopword files.

    Returns the list of cleaned, stopword-free sentences.
    """
    # declare and init textCleaner
    text_cleaner = TextCleaner(rm_punct=True, rm_digits=True, rm_hashtags=False)

    # cleaning text
    cleaned_list = []
    for text in input_list:
        if str(text) != "nan":
            text = text.replace(" /", "/")
            # drop very short words (1-3 characters)
            text = re.sub(r'\b\w{1,3}\b', '', text)
            cleaned_list.append(text_cleaner.regex_applier(text))

    # remove stopwords
    no_stopwords_sentences = []
    stopwords_dict = stopwordsDictFromFile(conf.ConfigSectionMap('STOPWORDS_FILES'))
    for text in cleaned_list:
        if str(text) != "nan":
            words = utilities.convertSentenceToListOfWords(text)
            # dict.has_key() was removed in Python 3 -- membership test
            # with `in` is the correct (and faster) replacement.
            words_no_stopwords = [
                word for word in words
                if word.lower().encode("utf-8") not in stopwords_dict
            ]
            no_stopwords_sentences.append(utilities.concatWords(words_no_stopwords))

    return no_stopwords_sentences
Code example #2
0
    def computeWord2Tweets(self):
        """Build reverse indices from word vectors / words to tweets.

        Populates:
          self.vec2tweets: stringified word vector -> list of tweets
                           containing that word
          self.vec2word:   stringified word vector -> the word itself
                           (last word seen wins for colliding vectors)
          self.word2tweet: word -> list of tweets containing it
        """
        self.vec2tweets = {}
        self.vec2word = {}
        self.word2tweet = {}

        # seed the indices with every word in the model vocabulary
        for key in self.model.wv.vocab.keys():
            vec = self.model[key]
            self.vec2tweets[utilities.convertListOfNumToString(vec)] = []
            self.word2tweet[key] = []

        for i in range(len(self.cleaned_tweets)):
            for word in utilities.convertSentenceToListOfWords(
                    self.cleaned_tweets[i]):
                # only the vector lookup can legitimately fail (OOV word);
                # the original bare `except: pass` also hid real bugs.
                try:
                    vec = self.model.wv[word]
                except KeyError:
                    # word not in the w2v vocabulary -- skip it
                    continue
                vec_key = utilities.convertListOfNumToString(vec)
                self.vec2tweets[vec_key].append(self.tweets[i])
                self.vec2word[vec_key] = word
                self.word2tweet[word].append(self.tweets[i])
Code example #3
0
def getEmbeddedWords(sentences, model=None):
    """Collect the embedding vector of every distinct in-vocabulary word.

    Args:
        sentences: iterable of sentence strings; entries whose string form
            is "nan" are skipped.
        model: a trained Word2Vec model; when None the pickled model from
            the configuration is loaded.

    Returns:
        (embeddings, index2word, word2index) where embeddings is a numpy
        array with one row per distinct word, and the two OrderedDicts map
        row index <-> word in insertion order.
    """
    log.info("getting Embedded Words from input")

    # lazily load the trained W2V model when none was supplied
    if model is None:
        model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))

    list_of_sentences = []
    embedded_words = []
    dict_index2word_tweet = OrderedDict()
    dict_word2index_tweet = OrderedDict()
    row_index = 0

    # from list of sentences to list of lists of words
    for text in sentences:
        if str(text) != "nan":
            list_of_sentences.append(
                utilities.convertSentenceToListOfWords(text))

    for sentence in list_of_sentences:
        for word in sentence:
            # skip words already inserted
            if word in dict_word2index_tweet:
                continue
            # keep the try body minimal: only the vector lookup can
            # raise KeyError (word missing from the w2v vocabulary)
            try:
                vector = model.wv[word]
            except KeyError:
                continue
            embedded_words.append(vector)
            dict_index2word_tweet[row_index] = word
            dict_word2index_tweet[word] = row_index
            row_index += 1

    return numpy.array(
        embedded_words), dict_index2word_tweet, dict_word2index_tweet
Code example #4
0
def trainW2Vmodel(corpus, new_model=False, identifier=""):
    """Train a Word2Vec model on the given corpus and save it to disk.

    Args:
        corpus: iterable of sentence strings; entries whose string form
            is "nan" are skipped.
        new_model: when False the model overwrites the default pickle path;
            when True it is saved under an identifier-specific filename.
        identifier: suffix used in the filename when new_model is True.
    """
    log.info("Training W2V Model")

    # convert list of sentences to list of lists of words
    corpus = [
        utilities.convertSentenceToListOfWords(text)
        for text in corpus
        if str(text) != "nan"
    ]

    # count total number of words without materialising a flat list
    num_words = sum(len(sentence) for sentence in corpus)

    debugging = conf.get('MAIN', 'debugging')

    if debugging == 'True':
        n_epoch = 10000
    else:
        # aim for roughly 1e9 word updates overall; guard against an empty
        # corpus so the division cannot raise ZeroDivisionError
        n_epoch = int(10**9 / max(num_words, 1))

    model = Word2Vec(size=conf.getint('W2V', 'size'),
                     min_count=conf.getint('W2V', 'min_count'),
                     sg=conf.getint('W2V', 'sg'),
                     window=conf.getint('W2V', 'window'),
                     iter=n_epoch,
                     alpha=conf.getfloat('W2V', 'alpha'),
                     workers=conf.getint('W2V', 'workers'))

    model.build_vocab(corpus, progress_per=conf.getint('W2V', 'progress_per'))

    # train the model over the corpus (this may take several minutes)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)

    if not new_model:
        model.save(conf.get('MAIN', 'path_pickle_w2v_model'))
        log.info("Model trained")
    else:
        filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                            ) + "word2vec_" + str(identifier) + ".pickle"
        model.save(filename)