Example #1
    def load_file_to_dict(self,
                          filename: str,
                          cols: Optional[List[int]] = None) -> Dict[str, int]:
        """Load columns of a csv file to word_dict.

        Args:
            filename: a csv file with ',' as separator
            cols: column indexes to be added to vocab

        Returns:
            word_dict: {<word>: frequency}
        """
        data_frame = pd.read_csv(filename)
        if not cols:
            cols = range(data_frame.shape[1])
        for row in data_frame.itertuples(index=False):
            for i in cols:
                sentence = str(row[i])
                if self.language == 'zh':
                    words = lawa.lcut(sentence)
                else:  # 'en'
                    words = nltk.word_tokenize(sentence)
                for word in words:
                    self.word_dict[word] = self.word_dict.get(word, 0) + 1
        return self.word_dict
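A minimal runnable sketch of the counting loop above, using a small in-memory DataFrame and the English branch only; the column name and sample data are illustrative, and NLTK's 'punkt' tokenizer data must already be available.

import pandas as pd
import nltk

# Illustrative data only; in the method above the frame comes from pd.read_csv(filename).
df = pd.DataFrame({'title': ['hello world', 'hello again']})
word_dict = {}
for row in df.itertuples(index=False):
    for i in range(df.shape[1]):
        for word in nltk.word_tokenize(str(row[i])):
            word_dict[word] = word_dict.get(word, 0) + 1
print(word_dict)  # {'hello': 2, 'world': 1, 'again': 1}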
Example #2
    def tokenize(sentence: str) -> List[str]:
        """Cut words for a sentence.

        Args:
            sentence: sentence

        Returns:
            words list
        """
        return lawa.lcut(sentence)
Example #3
    def tokenize(self, sentence: str) -> List[str]:
        """Cut words for a sentence.

        Args:
            sentence: sentence

        Returns:
            words list
        """
        if self.language == 'zh':
            words = lawa.lcut(sentence)
        else:  # 'en'
            words = nltk.word_tokenize(sentence)
        return words
Example #4
def get_test(positive):
    positive = positive.head(10)
    # Tokenize the text
    mycut = lambda s: ' '.join(lawa.lcut(s))  # custom tokenizer: join the cut words with spaces
    po = positive.content.apply(mycut)

    # ne = negtive.comment.apply(mycut)

    # Stop-word filtering (you can write your own stop-word list, one word per line, or use a published one; a published list is used here)
    # with open(r'C:\Users\Administrator\Desktop\python\项目\电商评论情感分析\stoplist.txt',encoding = 'utf-8') as f:
    #     stop  = f.read()
    # stop = [' ',''] + list(stop[0:])  # the loaded list is missing the space character, so add it ourselves

    po = po.apply(lambda s: s.split(' '))  # split the tokenized text back into word lists on spaces
    return po
Example #5
    def load_user_log(self, filename="data/user_search.csv"):
        df = pandas.read_csv(filename)
        topics, properties = [], []
        for row in df.itertuples(index=False):
            words = list(lawa.lcut(row[1]))
            pos_corpus = self.pos_dic.doc2bow(words)
            list_topic = self.pos_lda.get_document_topics(pos_corpus)
            topic = torch.zeros((1, self.pos_lda.num_topics), device='cuda')
            for id, top in list_topic:
                topic[0, id] += top
            user_id = self.username_embedding[row[0]]
            property = self.userproperty_embedding[user_id]
            topics.append(topic)
            properties.append(property)

        topics = torch.cat(topics)
        properties = torch.cat(properties)
        return TensorDataset(topics, properties)
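A hedged sketch of how the returned TensorDataset might be consumed with a DataLoader; the tensor shapes and batch size are illustrative assumptions, not values taken from the source.

import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in tensors with the same (topics, properties) pairing that load_user_log returns.
topics = torch.rand(100, 20)      # e.g. 100 users x 20 LDA topics (assumed sizes)
properties = torch.rand(100, 8)   # e.g. 100 users x 8 profile-embedding dimensions (assumed)
dataset = TensorDataset(topics, properties)

loader = DataLoader(dataset, batch_size=16, shuffle=True)
for topic_batch, property_batch in loader:
    pass  # feed each batch to a model here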
Example #6
model = KeyedVectors.load_word2vec_format('model/word2vec.txt', binary=False)
df = pandas.read_csv("data/summary.csv",
                     sep=',',
                     names=['gid', 'summary', 'sort'])
adf = df[df.summary.notnull()]
total = len(adf)
print(total)
dim = model.wv.vector_size
vocab = set(model.wv.vocab.keys())
with open("modelsummary/summary-512.txt", 'w', encoding='utf-8') as file:
    file.write(str(total) + " " + str(dim) + '\n')

    for row in tqdm(adf.itertuples(index=False)):
        gid = str(row[0])
        summary = str(row[1])
        words = lawa.lcut(summary)
        vecs = numpy.array([model[w] for w in words if w in vocab])
        if len(vecs):
            docvec = numpy.mean(vecs, axis=0)  # mean-pool the word vectors into one document vector
        else:
            docvec = model['。']  # no in-vocabulary words: fall back to the full-stop token's vector
        vecline = (gid + ' ' +
                   numpy.array2string(docvec,
                                      separator=' ',
                                      max_line_width=10**10,
                                      precision=8,
                                      floatmode='fixed',
                                      suppress_small=True).strip('[]') +
                   '\n').replace("  ", " ", 10**10)
        file.write(vecline)
        #print('.')
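Because the file starts with a "count dimension" header and then writes one "gid v1 v2 ..." line per row, it follows the word2vec text format, so it can presumably be reloaded for similarity queries; the gid in the commented query below is a placeholder, not a real identifier from data/summary.csv.

from gensim.models import KeyedVectors

# Reload the document vectors written above (word2vec text format).
doc_vectors = KeyedVectors.load_word2vec_format('modelsummary/summary-512.txt', binary=False)
# Replace 'some_gid' with any gid that actually appears in data/summary.csv.
# print(doc_vectors.most_similar('some_gid', topn=5))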
Example #7
                    help='model config file')
args = parser.parse_args()

positive = pd.read_csv(args.data_file, encoding='utf-8')
# negtive = pd.read_excel(r'C:\Users\Administrator\Desktop\python\项目\爬虫\京东评论\com_neg.xls',encoding = 'utf-8')
# Text deduplication (mainly removes reviews that the system auto-fills as default positive comments)
# positive = positive['content'].drop_duplicates()
# positive = positive['content']
# negtive = negtive ['comment'].drop_duplicates()
# negtive = negtive['comment']
type1 = positive['type1'].drop_duplicates()
print('Types:', len(type1), type1.tolist())

# positive = positive.head(10)
# Tokenize the text
mycut = lambda s: ' '.join(lawa.lcut(str(s)))  # custom tokenizer: join the cut words with spaces
po = positive.content.apply(mycut)

# ne =negtive.comment.apply(mycut)

# Stop-word filtering (you can write your own stop-word list, one word per line, or use a published one; a published list is used here)
# with open(r'C:\Users\Administrator\Desktop\python\项目\电商评论情感分析\stoplist.txt',encoding = 'utf-8') as f:
#     stop  = f.read()
# stop = [' ',''] + list(stop[0:])  # the loaded list is missing the space character, so add it ourselves

po = po.apply(lambda s: s.split(' '))  # split the tokenized text back into word lists on spaces

# po['2'] = po['1'].apply(lambda x: [i for i in x if i not in stop])  # filter out stop words

# The earlier word-cloud analysis could also be applied here
# post = []