class TweetDB(): def __init__(self): conf = Configuration() self.ptext = TextProcess(conf) self.ds = DataSet(conf) self.mongo = MongoDB(self.ds.db,self.ds.collection) self.tweet="" self.tokens = "" self.i = 0 self.enable_translation = self.ptext.translation self.translation_store = self.ptext.translation_store def get_tweet_from_db(self): where = { "text":{"$exists":"true"}, "geo.coordinates":{"$exists":"true"} } select = {"text":1,"source":1,"geo":1, "user":1,"retweet_count":1,"created_at":1} results = self.mongo.find(where,select) return results def process_tweets(self): tweets = self.get_tweet_from_db() for rawTweet in tweets: if "text" in rawTweet: tokens = {} self.ptext.set_tweet_text(rawTweet['text']) self.ptext.set_tweet_source(rawTweet['source']) self.ptext.process_text() rawTweet['source'] = self.ptext.get_tweet_source() rawTweet['text'] = self.ptext.get_tweet_text() self.tokens = self.ptext.get_tweet_tokens() tokens['tokens'] = self.tokens rawTweet.update(tokens) self.tweet = self.cleaner.unset_tweet_keys(rawTweet) if not self.ptext.get_translate_status(): self.ds.output_tweet(self.tweet) self.i += 1 else: if self.translation_store: if self.enable_translation: if not self.ptext.get_translate_failed(): self.ds.output_tweet(self.tweet) self.i += 1 else: self.ds.output_tweet(self.tweet) self.i += 1 def get_tweet_count(self): return self.i
print(f"Processing {location_name}") filter = {"gender": {"$exists": False}} print(f"Started at {datetime.now()}") counted_documents = collection.count_documents(filter, hint="gender_1") counter = 0 print(f"Needs to be processed {counted_documents}") pb = ProgressBar(total=counted_documents, prefix=location_name, decimals=3, length=50, fill='X', zfill='-') for item in collection.find( filter, no_cursor_timeout=True, projection=["uuid", "seller", "seller.name"], batch_size=100): counter += 1 pb.print_progress_bar(counter) if item['uuid'] and 'seller' in item and item['seller']['name']: uuid = item['uuid'] if item['seller']['name']: name_request_json = {"name": item['seller']['name']} r = requests.post(GENDER_RESOLVER_HOST, json=name_request_json) if r.status_code == 200: gender = r.json()['gender'] if gender != "UNKNOWN": gender = gender.lower() collection.update_one({"uuid": uuid},
class Collector(threading.Thread): def __init__(self, tab_images): super(Collector, self).__init__() self._lock = threading.RLock() self._db = MongoDB() self._thread_stop = False self._images = [] self._null_times = 0 self._read_pos = -1 self._write_pos = -1 self._tab_images = tab_images self._max_size = int(tools.get_conf_value('config.conf', "image_collector", "max_size")) self._interval = int(tools.get_conf_value('config.conf', "image_collector", "sleep_time")) self._allowed_null_times = int(tools.get_conf_value('config.conf', "image_collector", 'allowed_null_times')) self._image_count = int(tools.get_conf_value('config.conf', "image_collector", "images_count")) #初始时将正在做的任务至为未做 self._db.update(self._tab_images, {'image_pron_status':Constance.DOING}, {'image_pron_status':Constance.TODO}) self._db.set_ensure_index(self._tab_images, 'image_pron_status') self._finished_callback = None def run(self): log.debug('collector start ...') while not self._thread_stop: self.__input_data() time.sleep(self._interval) def stop(self): self._thread_stop = True if self._finished_callback: self._finished_callback() # @tools.log_function_time def __input_data(self): # log.debug('read_pos %d, write_pos %d buffer size %d'%(self._read_pos, self._write_pos, self.get_max_read_size())) # log.debug('buffer can write size = %d'%self.get_max_write_size()) if self.get_max_write_size() == 0: log.debug("collector 已满 size = %d"%self.get_max_read_size()) return url_count = self._image_count if self._image_count <= self.get_max_write_size() else self.get_max_write_size() images_list = [] images_list = self._db.find(self._tab_images, {"image_pron_status":Constance.TODO}, limit = url_count) #更新已取到的url状态为doing for url in images_list: self._db.update(self._tab_images, url, {'image_pron_status':Constance.DOING}) # 存url self.put_images(images_list) # if self.is_all_have_done(): # self.stop() def is_finished(self): return self._thread_stop def add_finished_callback(self, callback): self._finished_callback = callback # 没有可做的url def is_all_have_done(self): if self.get_max_read_size() == 0: self._null_times += 1 if self._null_times >= self._allowed_null_times: #检查数据库中有没有正在做的url images_doing = self._db.find(self._tab_images, {'status':Constance.DOING}) if images_doing: self._null_times = 0 return False else: return True else: self._null_times = 0 return False def get_max_write_size(self): size = 0 if self._read_pos == self._write_pos: size = self._max_size elif self._read_pos < self._write_pos: size = self._max_size - (self._write_pos - self._read_pos) else: size = self._read_pos - self._write_pos return size - 1 def get_max_read_size(self): return self._max_size -1 - self.get_max_write_size() # @tools.log_function_time def put_images(self, images_list): if images_list == []: return # 添加url 到 _images url_count = len((images_list)) end_pos = url_count + self._write_pos + 1 # 判断是否超出队列容量 超出的话超出的部分需要从头写 # 超出部分 overflow_end_pos = end_pos - self._max_size # 没超出部分 in_pos = end_pos if end_pos <= self._max_size else self._max_size # 没超出部分的数量 images_listCutPos = in_pos - self._write_pos - 1 self._lock.acquire() #加锁 self._images[self._write_pos + 1 : in_pos] = images_list[:images_listCutPos] if overflow_end_pos > 0: self._images[:overflow_end_pos] = images_list[images_listCutPos:] self._lock.release() self._write_pos += url_count self._write_pos %= self._max_size # -1 取余时问题 -1 % 1000 = 999 这样can write size 为0 images_list为空时返回 规避了这个问题 # @tools.log_function_time def get_images(self, count): self._lock.acquire() #加锁 images = [] count = count if count <= self.get_max_read_size() else self.get_max_read_size() end_pos = self._read_pos + count + 1 if end_pos > self._max_size: images.extend(self._images[self._read_pos + 1:]) images.extend(self._images[: end_pos % self._max_size]) else: images.extend(self._images[self._read_pos + 1: end_pos]) if images: self._read_pos += len(images) self._read_pos %= self._max_size self._lock.release() return images # db.getCollection('LiveApp_anchor_info').update({}, {$set:{'sexy_image_status':'', 'sexy_image_url':'', 'image_pron_status':0}}, false, true)
from db_queries.geo import vasilievskiy_ostrov from db_queries.people import males, persons from locations import LocationManager from mongodb import MongoDB if __name__ == '__main__': location_name = "SAINT-PETERSBURG" location = LocationManager().get_location(location_name) mongoDB = MongoDB(location.detailedCollectionName) filter = { "$and": [ vasilievskiy_ostrov, males, persons ] } distinct_r = mongoDB.find(filter=filter) print(f"Unique ads {len(distinct_r)}") name = "himki" # CsvGenerator.write_into_csv_file(distinct_r, f"{name}_{len(distinct_r)}.txt", ["phoneNumber"])
class TweetDB(): def __init__(self): conf = Configuration() self.ptext = TextProcess(conf) self.ds = DataSet(conf) self.mongo = MongoDB(self.ds.db, self.ds.collection) self.tweet = "" self.tokens = "" self.i = 0 self.enable_translation = self.ptext.translation self.translation_store = self.ptext.translation_store def get_tweet_from_db(self): where = { "text": { "$exists": "true" }, "geo.coordinates": { "$exists": "true" } } select = { "text": 1, "source": 1, "geo": 1, "user": 1, "retweet_count": 1, "created_at": 1 } results = self.mongo.find(where, select) return results def process_tweets(self): tweets = self.get_tweet_from_db() for rawTweet in tweets: if "text" in rawTweet: tokens = {} self.ptext.set_tweet_text(rawTweet['text']) self.ptext.set_tweet_source(rawTweet['source']) self.ptext.process_text() rawTweet['source'] = self.ptext.get_tweet_source() rawTweet['text'] = self.ptext.get_tweet_text() self.tokens = self.ptext.get_tweet_tokens() tokens['tokens'] = self.tokens rawTweet.update(tokens) self.tweet = self.cleaner.unset_tweet_keys(rawTweet) if not self.ptext.get_translate_status(): self.ds.output_tweet(self.tweet) self.i += 1 else: if self.translation_store: if self.enable_translation: if not self.ptext.get_translate_failed(): self.ds.output_tweet(self.tweet) self.i += 1 else: self.ds.output_tweet(self.tweet) self.i += 1 def get_tweet_count(self): return self.i