def __init__(self):
    self.mongo_operation = MongoOperation()
    self.ltp_parser = LtpParser()
    self.words_embedding_file = "../Data/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"
    # load only the first 100000 Tencent vectors to keep memory usage bounded
    self.wv_from_text = KeyedVectors.load_word2vec_format(
        self.words_embedding_file, binary=False, limit=100000)
    # L2-normalize the vectors in place (gensim 3.x API)
    self.wv_from_text.init_sims(replace=True)
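
# A minimal standalone sketch (not from the source) of loading and querying the
# same embeddings; assumes gensim 3.x and that the Tencent embedding file
# actually exists at this relative path.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    "../Data/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt",
    binary=False, limit=100000)          # limit keeps memory bounded
wv.init_sims(replace=True)               # L2-normalize in place (gensim 3.x)
print(wv.vector_size)                    # dimensionality of every word vector
if "北京" in wv:                          # guard: the word may fall outside the limit
    print(wv.most_similar("北京", topn=3))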
Example n. 2
class NewsSpiderPipeline(object):
    def __init__(self):
        self.file = FileOperation()
        self.db = MongoOperation()

    def process_item(self, item, spider):
        self.file.get_news_list_file(item)
        self.db.news_db_add(item)
        return item

    def spider_closed(self, spider):
        pass
class EventVector:
    def __init__(self):
        self.mongo_operation = MongoOperation()
        self.ltp_parser = LtpParser()
        self.words_embedding_file = "../Data/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"
        self.wv_from_text = KeyedVectors.load_word2vec_format(
            self.words_embedding_file, binary=False, limit=100000)
        self.wv_from_text.init_sims(replace=True)

    # Collect the events from the comma-separated event triples stored in the
    # database, skipping duplicates
    def get_event_from_triple(self):
        event_triple_sets = self.mongo_operation.event_db_get()
        event_sets = list()
        for event_triple in event_triple_sets:
            # a triple is "subject,relation,object": events sit at indices 0 and 2
            parts = event_triple.split(',')
            for event in (parts[0], parts[2]):
                if event not in event_sets:
                    event_sets.append(event)
        return event_sets
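
    # Worked example (illustrative, not from the source data) of the triple
    # format assumed above:
    #   triple = "疫情扩散,导致,经济下滑"      # "subject,relation,object"
    #   triple.split(',')[0]  -> "疫情扩散"    # subject event
    #   triple.split(',')[2]  -> "经济下滑"    # object event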

    # Build each event's vector by averaging its word vectors
    def get_event_vectors(self, event_sets):
        events_list = list()
        for event in event_sets:
            event_dict = dict()
            words_list = self.ltp_parser.get_words_by_pyltp(event)
            # start from a zero vector with the word-vector dimensionality
            vector_sum = numpy.zeros(self.wv_from_text.vector_size)
            matched = 0
            for word in words_list:
                # skip words outside the (limit=100000) vocabulary instead of
                # letting the lookup raise a KeyError
                if word in self.wv_from_text:
                    vector_sum += self.wv_from_text[word]
                    matched += 1
            # the event vector is the average of the matched word vectors
            event_vector = list(vector_sum / max(matched, 1))
            event_dict["event"] = event
            event_dict["vector"] = event_vector
            events_list.append(event_dict)
        return events_list
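
# Self-contained sketch of the averaging step with made-up 3-d vectors, just to
# show the arithmetic; the real Tencent vectors are 200-d.
import numpy

word_vectors = [numpy.array([1.0, 0.0, 2.0]),
                numpy.array([3.0, 2.0, 0.0])]
event_vector = numpy.mean(word_vectors, axis=0)
print(event_vector)  # [2. 1. 1.]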
Example n. 4
def __init__(self):
    self.file = FileOperation()
    self.db = MongoOperation()
Example n. 5
from event_extractor import EventExtrator
from event_relation_extractor import EventRelationExtractor
from IO.file_operation import FileOperation
from IO.database_operation import MongoOperation

if __name__ == '__main__':
    content = ""
    # content = '''
    # 部分研究结果显示即使在病毒传播密集地区,已有抗体的人群比例仍很低,意味着绝大多数人易感。
    # 如果想回到没有封锁措施的社会,需要等待一个中长期过程。
    # 因为我已经成功完成了作业了,因此我可以出去玩。
    # 我出去玩,结果摔了一跤。
    # '''
    # print(content)
    file_operation = FileOperation()
    db_operation = MongoOperation()
    # index of the first row to read
    flag = 0
    # read the sentences from line n to line n+5 of the test file (a
    # hypothetical sketch of this helper follows the snippet)
    text_list = file_operation.get_file_rows_list(
        file_operation.TEST_TEXT_FILE, flag, flag + 5)
    content += "".join(text_list)
    event_sets_list = list()
    event_extractor = EventExtrator()
    event_relation_extractor = EventRelationExtractor()
    event_relations_list = event_relation_extractor.event_relation_extrator_main(
        content)
    for event_relation in event_relations_list:
        if event_relation:
            pre_event = event_extractor.event_extrator_main(event_relation[0])
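
# Hypothetical sketch of what FileOperation.get_file_rows_list might look like;
# the real implementation is not shown in these snippets.
def get_file_rows_list(self, file_name, start, end):
    with open(file_name, 'r', encoding='utf-8') as fd:
        return fd.readlines()[start:end]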
def __init__(self):
    # fd = open(self.EVENT_RELATIONS_LIST_FILE_NAME, 'r')
    # self.event_triple_sets = fd.readlines()
    db = MongoOperation()
    self.event_triple_sets = db.event_db_get()
    print('\n'.join(self.event_triple_sets))
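
# MongoOperation.event_db_get is used throughout these examples but never
# shown. A hypothetical pymongo-based sketch; the connection string, database,
# collection and field names are pure assumptions.
import pymongo

class MongoOperation:
    def __init__(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        self.db = client["event_db"]

    def event_db_get(self):
        # return the stored "subject,relation,object" strings
        return [doc["triple"] for doc in self.db["events"].find()]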
    def do_cluster(self, numiter):
        for i in range(numiter):
            flag = 0
            # M step: recompute the center of each cluster
            for cluster in self.clusters:
                cluster.update_center()

            # E step: assign every point to its nearest cluster,
            # counting how many points changed cluster
            for point in self.points:
                flag += self.assign_point_cluster(point)

            # stop early once no point moved
            if flag == 0:
                break
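
# The Cluster and Point classes used above are not part of this snippet. A
# compact, self-contained numpy sketch of the same assign/update-until-stable
# loop; every name here is illustrative, and it uses plain random
# initialization rather than the k-means++ seeding of the class above.
import numpy as np

def kmeans(data, k, numiter=100, seed=0):
    rng = np.random.default_rng(seed)
    # start from k distinct data points as initial centers
    centers = data[rng.choice(len(data), k, replace=False)].astype(float)
    labels = None
    for _ in range(numiter):
        # E step: attach each point to its nearest center
        dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
        new_labels = dists.argmin(axis=1)
        if labels is not None and np.array_equal(new_labels, labels):
            break  # no point changed cluster: converged
        labels = new_labels
        # M step: move each center to the mean of its points
        for j in range(k):
            if np.any(labels == j):
                centers[j] = data[labels == j].mean(axis=0)
    return centers, labels

# e.g. centers, labels = kmeans(np.array(dataset, dtype=float), 4)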


if __name__ == "__main__":
    mongo_operation = MongoOperation()
    vectors_list = mongo_operation.vector_db_get()
    dataset = list()
    for vector_dict in vectors_list:
        dataset.append(vector_dict["vector"])
    # dataset = [[1, 1], [2, 3], [-1, 4], [5, 2], [-3, -7], [4, -2], [4, 2], [3, 3], [-2, 3], [-5, -3], [2, 5], [1,
    # -2], [3, 0], [0, 0], [1, 6], [1, 7]]
    kmean = Kmeanplusplus(dataset, 4)
    kmean.do_cluster(1000)
    for cluster in kmean.clusters:
        print("============")
        print("cluster:")
        print(cluster.center)
        for point in cluster.points:
            print(point.data)