def parse_document(self):
    """Parse ``self.source`` into a document dict and keep it on ``self.document``.

    The source is read as lines: the first line is stored as the source
    URL, the second is handed to ``parse_time`` for the timestamp, and the
    remaining lines are joined and passed to ``extract`` to obtain the text,
    which is then tokenized.

    Returns:
        True when the document was built and accepted by
        ``self.verify_document`` (it is then stored in ``self.document``).
        False when building it raised any exception, or when verification
        rejected it; ``self.document`` is left untouched in both cases.
    """
    import logging  # local import keeps this method self-contained

    try:
        raw = self.source.readlines()
        document = {'src_file': self.file_path}
        document['source_url'] = raw[0].strip()
        document['timestamp'] = parse_time(raw[1].strip())
        document['domain'] = urlparse(document['source_url']).netloc
        text = extract(''.join(raw[2:]).strip())
        document['text'] = text
        document['tokens'] = tokenize(text)
        # NOTE(review): the purpose of this flag is not evident from here.
        document['new_attr'] = 1
    except Exception as err:
        # Parsing stays best-effort (the caller only sees False), but the
        # failure is no longer silent: record which file was skipped and why.
        # getattr() is used because the failure may be a missing file_path.
        logging.warning('failed to parse document %r: %r',
                        getattr(self, 'file_path', None), err)
        return False

    if not self.verify_document(document):
        return False

    self.document = document
    return True
def update_socialmedial(src, dist):
    """Tokenize weibo documents from one collection and upsert them into another.

    Every document in ``db[src]`` without a ``tokens`` field gets three new
    fields: ``text`` (its content followed by its comments, one per line),
    ``tokens`` (``tokenize`` applied to that text) and ``timestamp`` (a
    datetime built from ``value.created_at``).  The result is upserted into
    ``db[dist]``, matched on the ``key`` field.  The weibo data already
    lives in the database, so nothing is read from files here, and the
    source collection itself is not written to.

    Args:
        src: the source collection name, string.
        dist: the destination collection name, string.

    Returns:
        True once the cursor is exhausted.  Documents that fail to convert
        or store are logged and skipped.
    """
    # Open the cursor before entering the try block: if find() itself
    # raises there is nothing to close, and touching ``cursor`` in the
    # ``finally`` clause would raise an unbound-name error that hides the
    # real failure.
    cursor = db[src].find({'tokens': {'$exists': 0}}, timeout=False)
    try:
        cursor.batch_size(100)
        for weibo in cursor:
            try:
                value = weibo['value']
                # Content first, then every comment, joined in one pass
                # instead of re-joining the growing text per comment.
                parts = [value['content']]
                parts.extend(
                    comment['comment'] for comment in value['comment_comment'])
                text = '\n'.join(parts)
                tokens = tokenize(text)
                logging.info(
                    'Parsing {} collection with the document id {}'.format(
                        src, weibo['_id']))
                weibo['text'] = text
                weibo['tokens'] = tokens
                weibo['timestamp'] = datetime.datetime.fromtimestamp(
                    float(value['created_at']))
                # Drop _id so the write into the destination collection does
                # not hit a duplicate key error.
                del weibo['_id']
                # Third positional argument is the upsert flag: insert the
                # document when no entry with this key exists yet.
                db[dist].update({'key': weibo['key']}, weibo, True)
            except Exception as e:
                # Best effort: log and move on to the next document.
                # NOTE: despite the message, nothing waits for user input.
                logging.error(e)
                logging.error('updating failure, press enter to continue...')
    finally:
        cursor.close()
    return True
def parse_document(self):
    """Parse the JSON weibo in ``self.source`` and keep it on ``self.weibo``.

    On top of the fields already present in the JSON, the stored dict gets:

    * ``src_file``  - ``self.file_path`` (used to check duplicate inserts)
    * ``keywords``  - name of the directory holding the file, split on spaces
    * ``tokens``    - ``tokenize`` applied to the ``content`` field
    * ``timestamp`` - datetime built from the ``date`` field, year fixed to 2014
    * ``heat``      - the original ``heat`` string with newlines removed
    * ``retweet``   - the retweet count found in ``heat``, or 0 if absent

    Returns:
        True on success.  False when a ValueError is raised while parsing
        (for example invalid JSON or a date that does not match the
        expected format); ``self.weibo`` is not assigned in that case.
    """
    try:
        weibo = json.loads(self.source.read())
        weibo['src_file'] = self.file_path  # used to check duplicate insert

        # The keywords are taken from the name of the containing directory.
        dir_name = os.path.basename(os.path.dirname(self.file_path))
        weibo['keywords'] = dir_name.split(' ')
        weibo['tokens'] = tokenize(weibo['content'])

        # The date field carries no year, so 2014 is prepended before parsing.
        # NOTE(review): time.strptime is kept on purpose - presumably because
        # Python 2's datetime.strptime cannot take a non-ASCII unicode
        # format; confirm before switching.
        date_text = u'2014年' + weibo['date'].strip()
        parsed = time.strptime(date_text, u'%Y年%m月%d日 %H:%M')
        weibo['timestamp'] = datetime.datetime(*parsed[0:6])

        # Heat of the weibo: strip newlines, then pull out the retweet count.
        heat = weibo['heat'].replace('\n', '')
        weibo['heat'] = heat
        # Same pattern as before, spelled as u'' + r'' because the ur''
        # prefix is a syntax error on Python 3.
        pattern = re.compile(u'.*转发' + r'\((?P<retweet_num>\d+)\).*')
        match = pattern.match(heat)
        weibo['retweet'] = int(match.group('retweet_num')) if match else 0
    except ValueError:
        # logging.exception keeps the message but adds the traceback, so the
        # log shows what actually failed.
        logging.exception('value error')
        return False

    self.weibo = weibo
    print_dict(self.weibo)  # debug info
    return True
def update_socialmedial(src, dist):
    """Tokenize weibo documents from one collection and upsert them into another.

    Every document in ``db[src]`` without a ``tokens`` field gets three new
    fields: ``text`` (its content followed by its comments, one per line),
    ``tokens`` (``tokenize`` applied to that text) and ``timestamp`` (a
    datetime built from ``value.created_at``).  The result is upserted into
    ``db[dist]``, matched on the ``key`` field.  The weibo data already
    lives in the database, so nothing is read from files here, and the
    source collection itself is not written to.

    Args:
        src: the source collection name, string.
        dist: the destination collection name, string.

    Returns:
        True once the cursor is exhausted.  Documents that fail to convert
        or store are logged and skipped.
    """
    # Open the cursor before entering the try block: if find() itself
    # raises there is nothing to close, and touching ``cursor`` in the
    # ``finally`` clause would raise an unbound-name error that hides the
    # real failure.
    cursor = db[src].find({'tokens': {'$exists': 0}}, timeout=False)
    try:
        cursor.batch_size(100)
        for weibo in cursor:
            try:
                value = weibo['value']
                # Content first, then every comment, joined in one pass
                # instead of re-joining the growing text per comment.
                parts = [value['content']]
                parts.extend(
                    comment['comment'] for comment in value['comment_comment'])
                text = '\n'.join(parts)
                tokens = tokenize(text)
                logging.info(
                    'Parsing {} collection with the document id {}'.format(
                        src, weibo['_id']))
                weibo['text'] = text
                weibo['tokens'] = tokens
                weibo['timestamp'] = datetime.datetime.fromtimestamp(
                    float(value['created_at']))
                # Drop _id so the write into the destination collection does
                # not hit a duplicate key error.
                del weibo['_id']
                # Third positional argument is the upsert flag: insert the
                # document when no entry with this key exists yet.
                db[dist].update({'key': weibo['key']}, weibo, True)
            except Exception as e:
                # Best effort: log and move on to the next document.
                # NOTE: despite the message, nothing waits for user input.
                logging.error(e)
                logging.error('updating failure, press enter to continue...')
    finally:
        cursor.close()
    return True
def parse_document(self):
    """Parse the JSON weibo in ``self.source`` and keep it on ``self.weibo``.

    On top of the fields already present in the JSON, the stored dict gets:

    * ``src_file``  - ``self.file_path`` (used to check duplicate inserts)
    * ``keywords``  - name of the directory holding the file, split on spaces
    * ``tokens``    - ``tokenize`` applied to the ``content`` field
    * ``timestamp`` - datetime built from the ``date`` field, year fixed to 2014
    * ``heat``      - the original ``heat`` string with newlines removed
    * ``retweet``   - the retweet count found in ``heat``, or 0 if absent

    Returns:
        True on success.  False when a ValueError is raised while parsing
        (for example invalid JSON or a date that does not match the
        expected format); ``self.weibo`` is not assigned in that case.
    """
    try:
        weibo = json.loads(self.source.read())
        weibo['src_file'] = self.file_path  # used to check duplicate insert

        # The keywords are taken from the name of the containing directory.
        dir_name = os.path.basename(os.path.dirname(self.file_path))
        weibo['keywords'] = dir_name.split(' ')
        weibo['tokens'] = tokenize(weibo['content'])

        # The date field carries no year, so 2014 is prepended before parsing.
        # NOTE(review): time.strptime is kept on purpose - presumably because
        # Python 2's datetime.strptime cannot take a non-ASCII unicode
        # format; confirm before switching.
        date_text = u'2014年' + weibo['date'].strip()
        parsed = time.strptime(date_text, u'%Y年%m月%d日 %H:%M')
        weibo['timestamp'] = datetime.datetime(*parsed[0:6])

        # Heat of the weibo: strip newlines, then pull out the retweet count.
        heat = weibo['heat'].replace('\n', '')
        weibo['heat'] = heat
        # Same pattern as before, spelled as u'' + r'' because the ur''
        # prefix is a syntax error on Python 3.
        pattern = re.compile(u'.*转发' + r'\((?P<retweet_num>\d+)\).*')
        match = pattern.match(heat)
        weibo['retweet'] = int(match.group('retweet_num')) if match else 0
    except ValueError:
        # logging.exception keeps the message but adds the traceback, so the
        # log shows what actually failed.
        logging.exception('value error')
        return False

    self.weibo = weibo
    print_dict(self.weibo)  # debug info
    return True