Python tokenize Exemples, text_seg.tokenize Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : content_extractor.py Projet : anukat2015/WebdataPipeline

    def parse_document(self):
        """Parse the raw source into a document dict stored on ``self.document``.

        ``self.source`` is read line by line. The first line is taken as the
        source URL, the second line is handed to ``parse_time`` to build the
        timestamp, and all remaining lines are joined and passed to
        ``extract`` as the HTML payload. The extracted text is tokenized with
        ``tokenize``.

        Returns:
            True when the document was built and accepted by
            ``self.verify_document`` (it is then stored on ``self.document``).
            False when any step of the parsing raised, or when verification
            rejected the document.
        """
        # Function-scope import keeps this method self-contained.
        import logging

        try:
            raw = self.source.readlines()

            document = {'src_file': self.file_path}
            document['source_url'] = raw[0].strip()
            document['timestamp'] = parse_time(raw[1].strip())
            document['domain'] = urlparse(document['source_url']).netloc

            # Everything after the two header lines is the page body.
            html = ''.join(raw[2:]).strip()
            text = extract(html)
            document['text'] = text
            document['tokens'] = tokenize(text)
            document['new_attr'] = 1
        except Exception as e:
            # Parsing is best-effort: a malformed input must not stop the
            # caller, but the failure is logged instead of being dropped
            # silently so that bad inputs can be diagnosed.
            logging.error('failed to parse document %s: %s',
                          getattr(self, 'file_path', None), e)
            return False

        if not self.verify_document(document):
            return False
        self.document = document
        return True

Exemple #2

0

Afficher le fichier

Fichier : content_extractor.py Projet : KeithYue/WebdataPipeline

def update_socialmedial(src, dist):
    '''
    Tokenize the documents of ``src`` that have no ``tokens`` field yet and
    upsert the enriched documents into ``dist``.

    param src: the source collection name, string.
    param dist: the destination collection name, string.

    The weibo data is a little bit different: it is already in the database,
    so it is read from a collection instead of from files.

    For every matching document the post content and the text of each entry
    in ``comment_comment`` are joined with newlines, tokenized, and stored
    together with a timestamp built from ``created_at``. A document that
    fails to process is logged and skipped.

    return: True once the cursor has been fully consumed.
    '''
    # Acquire the cursor BEFORE entering try/finally: if find() itself fails
    # there is nothing to close, and the real error must propagate instead of
    # being masked by an UnboundLocalError raised from the cleanup.
    cursor = db[src].find({'tokens': {'$exists': 0}}, timeout=False)
    try:
        cursor.batch_size(100)
        for weibo in cursor:
            try:
                value = weibo['value']
                # Post content first, then every comment, one per line.
                parts = [value['content']]
                parts.extend(comment['comment']
                             for comment in value['comment_comment'])
                text = '\n'.join(parts)
                tokens = tokenize(text)
                logging.info('Parsing {} collection with the document id {}'.format(src, weibo['_id']))
                weibo['text'] = text
                weibo['tokens'] = tokens
                weibo['timestamp'] = datetime.datetime.fromtimestamp(float(value['created_at']))
                # Remove the _id key to avoid a duplicate key error.
                del weibo['_id']
                # Write to the destination collection, not the original src
                # collection; the third argument enables upsert.
                db[dist].update({'key': weibo['key']}, weibo, True)
            except Exception as e:
                logging.error(e)
                logging.error('updating failure, press enter to continue...')
                continue
    finally:
        cursor.close()
    return True

Exemple #3

0

Afficher le fichier

Fichier : content_extractor.py Projet : KeithYue/WebdataPipeline

    def parse_document(self):
        """Build a document dict from ``self.source`` and keep it on success.

        Layout of the source as consumed here: line 1 is stripped and stored
        as ``source_url``, line 2 is stripped and converted by ``parse_time``
        into ``timestamp``, and the rest is joined, stripped and given to
        ``extract``. The resulting text is stored and tokenized.

        Returns:
            True if the document passed ``self.verify_document``; it is then
            assigned to ``self.document``.
            False if building the document raised or verification failed.
        """
        # Imported locally so the method does not rely on the module's
        # import section.
        import logging

        try:
            lines = self.source.readlines()

            document = {}
            document['src_file'] = self.file_path
            document['source_url'] = lines[0].strip()
            document['timestamp'] = parse_time(lines[1].strip())
            document['domain'] = urlparse(document['source_url']).netloc

            # Skip the two header lines; what remains is the page markup.
            text = extract(''.join(lines[2:]).strip())
            document['text'] = text
            document['tokens'] = tokenize(text)
            document['new_attr'] = 1
        except Exception as e:
            # Keep the best-effort contract (report failure, do not raise)
            # but record why the input was rejected instead of hiding it.
            logging.error('failed to parse document %s: %s',
                          getattr(self, 'file_path', None), e)
            return False

        if self.verify_document(document):
            self.document = document
            return True
        return False

Exemple #4

0

Afficher le fichier

Fichier : content_extractor.py Projet : KeithYue/WebdataPipeline

    def parse_document(self):
        """Parse a JSON weibo record from ``self.source`` into ``self.weibo``.

        The decoded record is enriched with:

        * ``src_file``  - ``self.file_path``, used to check duplicate inserts.
        * ``keywords``  - the name of the directory containing the file,
          split on spaces.
        * ``tokens``    - ``tokenize`` applied to the ``content`` field.
        * ``timestamp`` - a ``datetime`` built from the ``date`` field.
        * ``retweet``   - the number captured from the ``heat`` field, or 0
          when the pattern does not match.

        Returns:
            True on success (the record is stored on ``self.weibo`` and
            dumped with ``print_dict``). False when a ``ValueError`` is
            raised while parsing, e.g. invalid JSON or a ``date`` that does
            not fit the expected format.
        """
        try:
            weibo = json.loads(self.source.read())

            weibo['src_file'] = self.file_path  # used to check duplicate insert

            # The keywords are encoded in the parent directory name.
            dir_name = os.path.basename(os.path.dirname(self.file_path))
            weibo['keywords'] = dir_name.split(' ')
            weibo['tokens'] = tokenize(weibo['content'])

            # Construct the timestamp. NOTE: the year is hard-coded to 2014
            # and prepended to the stripped ``date`` field before parsing.
            actual_date_str = u'2014年' + weibo['date'].strip()
            time_tuple = time.strptime(actual_date_str, u'%Y年%m月%d日 %H:%M')
            weibo['timestamp'] = datetime.datetime(*time_tuple[0:6])

            # Heat of the weibo: pull the retweet counter out of the text.
            weibo['heat'] = weibo['heat'].replace('\n', '')
            # Plain u'' literal with escaped backslashes: the same pattern as
            # the former ur'' literal, which is a SyntaxError on Python 3.
            pattern = re.compile(u'.*转发\\((?P<retweet_num>\\d+)\\).*')
            match = pattern.match(weibo['heat'])
            weibo['retweet'] = int(match.group('retweet_num')) if match else 0
        except ValueError:
            logging.error('value error')
            return False

        self.weibo = weibo
        # debug info
        print_dict(self.weibo)
        return True

Exemple #5

0

Afficher le fichier

Fichier : content_extractor.py Projet : anukat2015/WebdataPipeline

def update_socialmedial(src, dist):
    '''
    Add text, tokens and a timestamp to the not-yet-tokenized documents of
    ``src`` and upsert them into ``dist``.

    param src: the source collection name, string.
    param dist: the destination collection name, string.

    The weibo data is a little bit different: it is already in the database.

    Only documents without a ``tokens`` field are selected. The text of a
    document is its content followed by each comment in ``comment_comment``,
    separated by newlines. Failures on a single document are logged and the
    loop moves on to the next one.

    return: True after all selected documents have been visited.
    '''
    # The cursor is opened outside the try/finally on purpose. If find()
    # raises, no cursor exists, so the cleanup must not run: it would raise
    # UnboundLocalError and hide the original exception.
    cursor = db[src].find({'tokens': {'$exists': 0}}, timeout=False)
    try:
        cursor.batch_size(100)
        for weibo in cursor:
            try:
                value = weibo['value']
                # One join over content + comments instead of re-joining the
                # growing text once per comment.
                text = '\n'.join(
                    [value['content']] +
                    [comment['comment']
                     for comment in value['comment_comment']])
                tokens = tokenize(text)
                logging.info(
                    'Parsing {} collection with the document id {}'.format(
                        src, weibo['_id']))
                weibo['text'] = text
                weibo['tokens'] = tokens
                weibo['timestamp'] = datetime.datetime.fromtimestamp(
                    float(value['created_at']))
                # Remove the _id key to avoid a duplicate key error.
                del weibo['_id']
                # Update the destination collection, not the original src
                # collection.
                db[dist].update({'key': weibo['key']}, weibo,
                                True)  # use upsert to insert the document
            except Exception as e:
                logging.error(e)
                logging.error('updating failure, press enter to continue...')
                continue
    finally:
        cursor.close()
    return True

Exemple #6

0

Afficher le fichier

Fichier : content_extractor.py Projet : anukat2015/WebdataPipeline

    def parse_document(self):
        """Load a weibo JSON record from ``self.source`` and enrich it.

        Fields added to the decoded record: ``src_file`` (the file path, used
        to check duplicate inserts), ``keywords`` (parent directory name split
        on spaces), ``tokens`` (``tokenize`` of ``content``), ``timestamp``
        (parsed from ``date``) and ``retweet`` (count captured from ``heat``,
        0 when absent).

        Returns:
            True when the record was parsed; it is stored on ``self.weibo``
            and passed to ``print_dict``. False when parsing raised a
            ``ValueError`` (for instance malformed JSON or an unparsable
            ``date``).
        """
        try:
            raw = self.source.read()
            weibo = json.loads(raw)

            weibo['src_file'] = self.file_path  # used to check duplicate insert

            # Parse the key words from the directory that holds the file.
            full_dir_path = os.path.dirname(self.file_path)
            weibo['keywords'] = os.path.basename(full_dir_path).split(' ')
            weibo['tokens'] = tokenize(weibo['content'])

            # Construct the timestamp; the year 2014 is hard-coded and put in
            # front of the stripped ``date`` value before parsing.
            actual_date_str = u'2014年' + weibo['date'].strip()
            time_tuple = time.strptime(actual_date_str, u'%Y年%m月%d日 %H:%M')
            weibo['timestamp'] = datetime.datetime(*time_tuple[0:6])

            # Heat of the weibo.
            weibo['heat'] = weibo['heat'].replace('\n', '')
            # The ur'' prefix does not exist on Python 3; an ordinary u''
            # literal with doubled backslashes yields the identical pattern
            # and is accepted by both Python 2 and Python 3.
            pattern = re.compile(u'.*转发\\((?P<retweet_num>\\d+)\\).*')
            match = pattern.match(weibo['heat'])
            if match:
                weibo['retweet'] = int(match.group('retweet_num'))
            else:
                weibo['retweet'] = 0
        except ValueError:
            logging.error('value error')
            return False

        self.weibo = weibo
        # debug info
        print_dict(self.weibo)
        return True