예제 #1
0
class AutoTransferHandler:
    """Copy every document from one collection into another.

    Both collections live in the same database and are opened through
    ``MongoConfig`` with ``use_localhost=False``.
    """

    def __init__(self, database_name, collection_name_from,
                 collection_name_to):
        """Open handles on the source and destination collections.

        Args:
            database_name: Database that holds both collections.
            collection_name_from: Collection the documents are read from.
            collection_name_to: Collection the documents are inserted into.
        """
        self.config_from = MongoConfig(database_name=database_name,
                                       collection_name=collection_name_from,
                                       use_localhost=False)
        self.config_to = MongoConfig(database_name=database_name,
                                     collection_name=collection_name_to,
                                     use_localhost=False)
        # Objects returned by GetMongoCursor(); used below through
        # find() / insert_one().
        self.cursor_from = self.config_from.GetMongoCursor()
        self.cursor_to = self.config_to.GetMongoCursor()
        # Number of documents inserted so far.
        self.count = 0

    def start(self):
        """Insert each source document into the destination collection.

        Progress is printed after every insert. Both connections are closed
        when the transfer ends, whether it completes or raises; an exception
        raised during the transfer still propagates to the caller.
        """
        print('Transfer Begin')
        try:
            for document in self.cursor_from.find():
                self.cursor_to.insert_one(document)
                self.count += 1
                print('Insert success document {}'.format(self.count))
            print('Transfer Done.')
        finally:
            # Nested so the destination is closed even if closing the
            # source connection fails.
            try:
                self.config_from.CloseConnection()
            finally:
                self.config_to.CloseConnection()
예제 #2
0
def Summary():
    """Store a generated summary on every document of ``TechHub.zhihu``.

    For each document, a ``content`` whose length is below 10 causes the
    document to be deleted (matched by ``url``); otherwise the text returned
    by ``AutoSummaryHandler.GetSummary`` is written to the document's
    ``summary`` field. The connection is closed when the function exits,
    including when an exception interrupts the loop.
    """
    config = MongoConfig(database_name='TechHub',
                         collection_name='zhihu',
                         use_localhost=False)
    try:
        cursor = config.GetMongoCursor()
        summary_handler = AutoSummaryHandler()

        # `count` is the zero-based position of the document in the scan;
        # it advances for deleted documents too.
        for count, document in enumerate(cursor.find()):
            content = document['content']
            url = document['url']
            if len(content) < 10:
                cursor.delete_one({"url": url})
                print('Remove too short document')
            else:
                summary = summary_handler.GetSummary(content=content)
                cursor.update_one({'url': url}, {'$set': {'summary': summary}})
                print('Finish summary for document {}'.format(count))
    finally:
        # release resources even if summarizing or a database call failed
        config.CloseConnection()
예제 #3
0
from common.Config import MongoConfig

if __name__ == '__main__':
    # Rewrite the `date` field of every document in TechHub.CSDN from the
    # "YYYY年MM月DD日 ..." form to "YYYY-MM-DD".
    config = MongoConfig(database_name='TechHub', collection_name='CSDN')
    try:
        cursor = config.GetMongoCursor()

        for count, document in enumerate(cursor.find(), start=1):
            url = document['url']
            raw_date = document['date']
            # Keep only the first whitespace-separated token, then turn the
            # year/month markers into dashes and drop the day marker.
            new_time = (raw_date.split()[0]
                        .replace("年", "-")
                        .replace("月", "-")
                        .replace("日", ""))
            if len(new_time) != 10:
                # Abort rather than store a date that is not 10 characters
                # long; name the offending document so it can be found.
                raise ValueError(
                    'unexpected date {!r} for document {}'.format(
                        raw_date, url))
            cursor.update_one({'url': url}, {'$set': {'date': new_time}})
            print('Finish update document {}'.format(count))
        print("done")
    finally:
        # Close the connection even when a malformed date aborts the run.
        config.CloseConnection()
예제 #4
0
class DataFilterHandler:
    """Delete duplicate-URL and (optionally) near-duplicate documents.

    The collection is scanned once. A document whose ``url`` was already
    seen earlier in the scan is deleted; when the similarity filter is
    enabled, a document whose content code is reported as similar to any
    previously accepted code is deleted as well.
    """

    def __init__(self,
                 database_name,
                 collection_name,
                 use_localhost=False,
                 use_similarity_filter=True):
        """Open the collection and set up the filter state.

        Args:
            database_name: Database that holds the collection.
            collection_name: Collection to clean up.
            use_localhost: Forwarded unchanged to ``MongoConfig``.
            use_similarity_filter: When true, ``start`` also runs
                ``FilterSimilarity`` on every non-duplicate document.
        """
        self.mongo_config = MongoConfig(database_name=database_name,
                                        collection_name=collection_name,
                                        use_localhost=use_localhost)
        self.collection_cursor = self.mongo_config.GetMongoCursor()
        self.use_similarity_filter = use_similarity_filter
        self.similarity_filter = SimHashFilter(128)
        # Number of documents that passed the duplicate-URL check.
        self.count = 0
        # Codes of the documents accepted by the similarity filter so far.
        self.code_list = []

    # filter with distinct url and similarity
    def start(self):
        """Scan the collection and delete duplicates.

        The connection is closed when the scan ends, whether it completes
        or raises.
        """
        cursor = self.collection_cursor
        url_set = set()

        try:
            for document in cursor.find():
                url = document['url']
                content = document['content']

                # first filter same url
                if url in url_set:
                    # Delete by _id so only this later copy goes away; a
                    # delete keyed on the url could also remove the first
                    # occurrence that is meant to be kept.
                    cursor.delete_one({"_id": document['_id']})
                    print('Duplicate url for {}'.format(url))
                    continue

                url_set.add(url)

                # then filter similarity threshold
                if self.use_similarity_filter:
                    self.FilterSimilarity(cursor, url, content,
                                          document_id=document['_id'])
                else:
                    print('Valid document for position: {}'.format(
                        self.count))
                    self.count += 1
        finally:
            self.mongo_config.CloseConnection()
        print('Data filter done.')

    def FilterSimilarity(self, cursor, url, content, document_id=None):
        """Delete the document if its content resembles an accepted one.

        Args:
            cursor: Collection handle used for the delete.
            url: URL of the document; used in messages and, when
                ``document_id`` is not given, to select the document.
            content: Text passed to the similarity filter.
            document_id: Optional ``_id`` of the document. When provided,
                the delete targets exactly this document.
        """
        code = self.similarity_filter.GetCodeForText(text=content)

        if not self.code_list:
            # Nothing to compare against yet: accept the first code as-is.
            self.code_list.append(code)
            self.count += 1
            return

        for known_code in self.code_list:
            similar, similarity = self.similarity_filter.IsSimilarByCode(
                known_code, code)
            if similar:
                if document_id is not None:
                    selector = {"_id": document_id}
                else:
                    selector = {"url": url}
                cursor.delete_one(selector)
                print('Delete document for {}, similarity is {}'.format(
                    url, similarity))
                break
        else:
            # No accepted code was similar: keep the document and remember
            # its code for later comparisons.
            print('Valid document for position: {}'.format(self.count))
            self.code_list.append(code)

        self.count += 1