def on_data(self, data):
    try:
        if self.count < self.limit:
            tweet = json.loads(data)
            if tweet['lang'] == 'en' and tweet['user'].get('location') is not None:
                place = tweet['user'].get('location')
                if place:
                    tweet_id = str(tweet['id'])
                    # Geocode the free-text user location to lat/lng.
                    geocode_result = gmaps.geocode(place)
                    lat = geocode_result[0]['geometry']['location']['lat']
                    lng = geocode_result[0]['geometry']['location']['lng']
                    tweet_text = tweet['text'].lower().encode('ascii', 'ignore').decode('ascii')
                    raw_tweet = {
                        'user': tweet['user']['screen_name'],
                        'text': tweet_text,
                        'place': place,
                        'coordinates': {'location': str(lat) + "," + str(lng)},
                        'time': tweet['created_at'],
                        'category': get_category(tweet_text)
                    }
                    es.index(index=ES_INDEX, doc_type=ES_TYPE, id=tweet_id, body=raw_tweet)
                    self.count += 1
        else:
            stream.disconnect()
    except Exception:
        # Malformed payloads and transient indexing errors are intentionally swallowed.
        pass

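For orientation, a minimal wiring sketch for a handler like the one above, assuming the tweepy 3.x streaming API; the class name `TweetListener`, the placeholder credentials, and the track keywords are illustrative assumptions, not part of the snippet.

# Hypothetical wiring sketch (tweepy 3.x API assumed; credentials and track
# terms are placeholders). The on_data method above is assumed to live on a
# StreamListener subclass, and the module-level `stream` it disconnects is the
# one created here.
import tweepy

CONSUMER_KEY = '...'
CONSUMER_SECRET = '...'
ACCESS_TOKEN = '...'
ACCESS_SECRET = '...'

class TweetListener(tweepy.StreamListener):
    def __init__(self, limit=1000):
        super().__init__()
        self.count = 0      # tweets indexed so far
        self.limit = limit  # stop indexing after this many tweets

    # on_data as defined above

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
stream = tweepy.Stream(auth, TweetListener())
stream.filter(track=['keyword'], languages=['en'])
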
def review_revision(request, revision_id):
    revision = TextbookCompanionRevision.objects.using('scilab').get(
        id=revision_id)
    file = utils.get_file(revision.example_file.filepath,
                          revision.commit_sha, main_repo=False)
    code = base64.b64decode(file['content'])
    request.session['revision_id'] = revision_id
    example = revision.example_file.example
    chapter = example.chapter
    book = chapter.preference
    category = utils.get_category(book.category)
    data = {
        'code': code,
        'revision': model_to_dict(revision),
        'example': model_to_dict(example),
        'chapter': model_to_dict(chapter),
        'book': model_to_dict(book),
        'category': category,
        'createdAt': str(revision.timestamp),
    }
    return simplejson.dumps(data)

def main():
    args = make_args()
    config = configparser.ConfigParser()
    utils.load_config(config, args.config)
    for cmd in args.modify:
        utils.modify_config(config, cmd)
    with open(os.path.expanduser(os.path.expandvars(args.logging)), 'r') as f:
        logging.config.dictConfig(yaml.load(f))
    cache_dir = utils.get_cache_dir(config)
    os.makedirs(cache_dir, exist_ok=True)
    shutil.copyfile(
        os.path.expanduser(os.path.expandvars(config.get('cache', 'category'))),
        os.path.join(cache_dir, 'category'))
    category = utils.get_category(config)
    category_index = dict([(name, i) for i, name in enumerate(category)])
    datasets = config.get('cache', 'datasets').split()
    for phase in args.phase:
        path = os.path.join(cache_dir, phase) + '.pkl'
        logging.info('save cache file: ' + path)
        data = []
        for dataset in datasets:
            logging.info('load %s dataset' % dataset)
            module, func = dataset.rsplit('.', 1)
            module = importlib.import_module(module)
            func = getattr(module, func)
            data += func(config, path, category_index)
        if config.getboolean('cache', 'shuffle'):
            random.shuffle(data)
        with open(path, 'wb') as f:
            pickle.dump(data, f)
    logging.info('%s data are saved into %s' % (str(args.phase), cache_dir))

def main():
    args = make_args()
    config = configparser.ConfigParser()
    utils.load_config(config, args.config)
    for cmd in args.modify:
        utils.modify_config(config, cmd)
    with open(os.path.expanduser(os.path.expandvars(args.logging)), 'r') as f:
        logging.config.dictConfig(yaml.load(f))
    cache_dir = utils.get_cache_dir(config)
    model_dir = utils.get_model_dir(config)
    category = utils.get_category(
        config, cache_dir if os.path.exists(cache_dir) else None)
    anchors = utils.get_anchors(config)
    anchors = torch.from_numpy(anchors).contiguous()
    path, step, epoch = utils.train.load_model(model_dir)
    state_dict = torch.load(path, map_location=lambda storage, loc: storage)
    dnn = utils.parse_attr(config.get('model', 'dnn'))(
        model.ConfigChannels(config, state_dict), anchors, len(category))
    dnn.load_state_dict(state_dict)
    height, width = tuple(map(int, config.get('image', 'size').split()))
    resize = transform.parse_transform(config,
                                       config.get('transform', 'resize_test'))
    transform_image = transform.get_transform(
        config, config.get('transform', 'image_test').split())
    transform_tensor = transform.get_transform(
        config, config.get('transform', 'tensor').split())
    # load image
    image_bgr = cv2.imread('image.jpg')
    image_resized = resize(image_bgr, height, width)
    image = transform_image(image_resized)
    tensor = transform_tensor(image).unsqueeze(0)
    # Checksum
    for key, var in dnn.state_dict().items():
        a = var.cpu().numpy()
        print('\t'.join(map(str, [
            key, a.shape, utils.abs_mean(a),
            hashlib.md5(a.tostring()).hexdigest(),
        ])))
    output = dnn(torch.autograd.Variable(tensor, volatile=True)).data
    for key, a in [
            ('image_bgr', image_bgr),
            ('image_resized', image_resized),
            ('tensor', tensor.cpu().numpy()),
            ('output', output.cpu().numpy()),
    ]:
        print('\t'.join(map(str, [
            key, a.shape, utils.abs_mean(a),
            hashlib.md5(a.tostring()).hexdigest(),
        ])))

def process_cmd_comments():
    try:
        queue_item = QUEUE_COMMENTS.get_nowait()
    except queue.Empty:
        return
    comment: Comment = queue_item[0]
    cmd_str = comment["body"]
    logger.debug(cmd_str)
    parsed_cmd = parse_command(cmd_str)
    if parsed_cmd is None:
        logger.info("No command found in %s", comment["url"])
        QUEUE_COMMENTS.task_done()
        return
    if parsed_cmd["help"] is not None and comment["author"] != ACCOUNT:
        if not replied_to_comment(comment, ACCOUNT):
            if reply_message(comment, MESSAGES["HELP"], ACCOUNT):
                logger.info("Help message replied to %s", comment["url"])
            else:
                logger.info("Couldn't reply to %s", comment["url"])
        else:
            logger.info("Already replied with help command to %s",
                        comment["url"])
        QUEUE_COMMENTS.task_done()
        return
    if parsed_cmd["help"] is None and parsed_cmd.get("status") is None:
        if (len([x for x in parsed_cmd if parsed_cmd[x] is not None]) > 1
                and not replied_to_comment(comment, ACCOUNT)):
            if reply_message(comment, MESSAGES["STATUS_MISSING"], ACCOUNT):
                logger.info("Missing status parameter message sent to %s",
                            comment["url"])
            else:
                logger.info("Couldn't reply to %s", comment["url"])
        QUEUE_COMMENTS.task_done()
        return
    root_comment = queue_item[1]
    category = get_category(root_comment, TASKS_PROPERTIES)
    if category is None:
        logger.info("No valid category found. %s", root_comment["url"])
        QUEUE_COMMENTS.task_done()
        return
    if ACCOUNT:
        reply = replied_to_comment(root_comment, ACCOUNT)
        send_summary_to_steem(parsed_cmd, reply, root_comment)
    if DISCORD_WEBHOOK_TASKS:
        content = (
            f'[{parsed_cmd["status"].upper()}] <{build_comment_link(root_comment)}>'
        )
        embeds = [build_discord_tr_embed(root_comment, parsed_cmd)]
        send_message_to_discord(DISCORD_WEBHOOK_TASKS, content, embeds)
    QUEUE_COMMENTS.task_done()

def to_python(self, data_row):
    studygroup = dict(
        hours=data_row[1],
        start=data_row[2],
        end=data_row[3],
    )
    studygroup_query = studygroup.copy()
    studygroup_query.update(dict(
        subject__short_name=data_row[0],
        department__name=data_row[4],
    ))
    subject = dict(
        short_name=data_row[0],
    )
    department = dict(
        name=data_row[4],
    )
    organization_name = data_row[6]
    match = re.findall(r'\d+', organization_name)
    organization_cast = get_organization_type(organization_name)
    organization = dict(
        name=organization_name,
        number=match[0] if match else None,
        cast=organization_cast,
    )
    last_name, first_name, patronymic = data_row[5]
    listener_position = get_position_fuzzy(data_row[7])
    listener = dict(
        first_name_inflated=first_name,
        last_name_inflated=last_name,
        patronymic_inflated=patronymic,
        position=listener_position,
        profile=get_profile_fuzzy(data_row[7]),
        category=get_category(organization_cast, listener_position),
    )
    attestation_work_name = data_row[8]
    cert_number = data_row[9]
    return {
        'studygroup': studygroup,
        'studygroup_query': studygroup_query,
        'department': department,
        'subject': subject,
        'listener': listener,
        'organization': organization,
        'attestation_work_name': attestation_work_name,
        'cert_number': cert_number,
    }

def split_train_data(self, paths, ratio=None):
    '''Get split train/validation data paths.'''
    if ratio is None:
        ratio = 0.05 / (1 - 1.0 / len(self.fold_itens))

    # Get data files.
    brains_data = Files("")
    brains_data.paths = paths
    brains_data = brains_data.get_file_names()
    total_brains = len(brains_data)
    validation_size = int(total_brains * ratio)

    # Get total of brains by category.
    brains_by_patient = group_brains_by_patient_id(brains_data)
    validation_paths = []
    train_paths = []
    brains_by_category = group_brains_by_category(brains_data)
    statistic = {}

    # Initialize statistic data: how many validation brains to draw per category.
    for label in brains_by_category:
        statistic[label] = np.round(
            len(brains_by_category[label]) * validation_size * 1.0 / total_brains)

    # Create train and validation set, keeping all brains of a patient together.
    for label in brains_by_category:
        index_brains = 0
        while statistic[label] > 0:
            patient_id = get_patient_id(brains_by_category[label][index_brains])
            brains_patient = brains_by_patient[patient_id]
            for brain_patient in brains_patient:
                validation_paths.append(brain_patient)
                statistic[get_category(brain_patient)] -= 1
            index_brains += 1

    for brain in brains_data:
        if brain not in validation_paths:
            train_paths.append(brain)

    # Shuffle data in place (shuffling a temporary np.array copy would have no effect).
    np.random.shuffle(validation_paths)
    np.random.shuffle(train_paths)

    return train_paths, validation_paths

def main():
    args = make_args()
    config = configparser.ConfigParser()
    utils.load_config(config, args.config)
    for cmd in args.modify:
        utils.modify_config(config, cmd)
    with open(os.path.expanduser(os.path.expandvars(args.logging)), 'r') as f:
        logging.config.dictConfig(yaml.load(f))
    model_dir = utils.get_model_dir(config)
    category = utils.get_category(config)
    anchors = torch.from_numpy(utils.get_anchors(config)).contiguous()
    path, step, epoch = utils.train.load_model(model_dir)
    state_dict = torch.load(path, map_location=lambda storage, loc: storage)
    _model = utils.parse_attr(config.get('model', 'dnn'))
    dnn = _model(model.ConfigChannels(config, state_dict), anchors, len(category))
    logging.info(
        humanize.naturalsize(
            sum(var.cpu().numpy().nbytes for var in dnn.state_dict().values())))
    dnn.load_state_dict(state_dict)
    height, width = tuple(map(int, config.get('image', 'size').split()))
    image = torch.autograd.Variable(
        torch.randn(args.batch_size, 3, height, width))
    output = dnn(image)
    state_dict = dnn.state_dict()
    d = utils.dense(state_dict[args.name])
    keep = torch.LongTensor(np.argsort(d)[:int(len(d) * args.keep)])
    modifier = utils.channel.Modifier(
        args.name, state_dict, dnn,
        lambda name, var: var[keep],
        lambda name, var, mapper: var[mapper(keep, len(d))],
        debug=args.debug,
    )
    modifier(output.grad_fn)
    if args.debug:
        path = modifier.dot.view(
            '%s.%s.gv' % (os.path.basename(model_dir),
                          os.path.basename(os.path.splitext(__file__)[0])),
            os.path.dirname(model_dir))
        logging.info(path)
    assert len(keep) == len(state_dict[args.name])
    dnn = _model(model.ConfigChannels(config, state_dict), anchors, len(category))
    dnn.load_state_dict(state_dict)
    dnn(image)
    if not args.debug:
        torch.save(state_dict, path)

def main():
    while True:
        try:
            queue_item = QUEUE_COMMENTS.get_nowait()
        except queue.Empty:
            continue
        comment: Comment = queue_item[0]
        cmd_str = comment["body"]
        LOGGER.debug(cmd_str)
        parsed_cmd = parse_command(cmd_str)
        if parsed_cmd is None:
            LOGGER.info("No command found")
            QUEUE_COMMENTS.task_done()
            continue
        elif parsed_cmd["help"] is not None and comment["author"] != ACCOUNT:
            replied = False
            for reply in comment.get_replies():
                if reply["author"] == ACCOUNT:
                    LOGGER.info("Already replied with help command. %s",
                                comment["url"])
                    replied = True
                    break
            if not replied:
                send_help_message(comment, ACCOUNT)
            QUEUE_COMMENTS.task_done()
            continue
        if parsed_cmd.get("status") is None:
            if len([x for x in parsed_cmd if parsed_cmd[x] is not None]) > 1:
                send_missing_status_message(comment, ACCOUNT)
            QUEUE_COMMENTS.task_done()
            continue
        root_comment = queue_item[1]
        category = get_category(root_comment, TASKS_PROPERTIES)
        if category is None:
            LOGGER.info("No valid category found. %s", root_comment["url"])
            QUEUE_COMMENTS.task_done()
            continue
        category = TASKS_PROPERTIES[category]["category"]
        webhook = DiscordWebhook(
            url=DISCORD_WEBHOOK_TASKS,
            content=f'[{category.upper()}][{parsed_cmd["status"].upper()}] <{build_comment_link(root_comment)}>',
        )
        webhook.add_embed(build_discord_tr_embed(root_comment, parsed_cmd))
        webhook.execute()
        QUEUE_COMMENTS.task_done()

def follow_euro_2016(user_data, callback, data, delta=60):
    # logging.info('following Euro2016')
    # goal at 22:17
    current_time = dateutil.parser.parse('06/10/2016 22:14:00 +0200')
    current_counter = Counter()
    last_goal = None
    min_time = datetime.timedelta(minutes=4)
    previous_score = 0, 0
    while True:
        tweets = get_tweets_around(current_time, data, delta)
        categories = [get_category(tweet['text']) for tweet in tweets]
        counter = Counter(categories)
        # logging.info('%s %s', current_time, counter)
        if (counter['BUT'] > current_counter['BUT'] + 50
                and (last_goal is None or current_time >= last_goal + min_time)):
            # for tweet in random.sample(tweets, 10):
            #     logging.debug(tweet['text'])
            scores = parse_tweets(tweets)
            scores = Counter(scores)
            if scores:
                country1, score1, country2, score2 = max(scores, key=scores.get)
                score1 = int(score1)
                score2 = int(score2)
                prev_score1, prev_score2 = previous_score
                if score1 - prev_score1 + score2 - prev_score2 == 1:
                    scorer = country1 if score1 > prev_score1 else country2
                    previous_score = score1, score2
                    yield country1, score1, country2, score2, scorer
                    last_goal = current_time
        time.sleep(delta)
        current_time += datetime.timedelta(seconds=delta)
        current_counter = counter

def main():
    args = make_args()
    config = configparser.ConfigParser()
    utils.load_config(config, args.config)
    for cmd in args.modify:
        utils.modify_config(config, cmd)
    with open(os.path.expanduser(os.path.expandvars(args.logging)), 'r') as f:
        logging.config.dictConfig(yaml.load(f))
    model_dir = utils.get_model_dir(config)
    category = utils.get_category(config)
    anchors = torch.from_numpy(utils.get_anchors(config)).contiguous()
    try:
        path, step, epoch = utils.train.load_model(model_dir)
        state_dict = torch.load(path, map_location=lambda storage, loc: storage)
    except (FileNotFoundError, ValueError):
        logging.warning('model cannot be loaded')
        state_dict = None
    dnn = utils.parse_attr(config.get('model', 'dnn'))(
        model.ConfigChannels(config, state_dict), anchors, len(category))
    logging.info(
        humanize.naturalsize(
            sum(var.cpu().numpy().nbytes for var in dnn.state_dict().values())))
    if state_dict is not None:
        dnn.load_state_dict(state_dict)
    height, width = tuple(map(int, config.get('image', 'size').split()))
    image = torch.autograd.Variable(
        torch.randn(args.batch_size, 3, height, width))
    output = dnn(image)
    state_dict = dnn.state_dict()
    graph = utils.visualize.Graph(config, state_dict)
    graph(output.grad_fn)
    diff = [key for key in state_dict if key not in graph.drawn]
    if diff:
        logging.warning('variables not shown: ' + str(diff))
    path = graph.dot.view(
        os.path.basename(model_dir) + '.gv', os.path.dirname(model_dir))
    logging.info(path)

def __getitem__(self, idx):
    i = idx * batch_size
    length = min(batch_size, (len(self.ids) - i))
    X = np.empty((length, img_rows, img_cols, 3), dtype=np.float32)
    Y = np.empty((length, img_rows, img_cols, num_classes), dtype=np.float32)
    for i_batch in range(length):
        id = self.ids[i + i_batch]
        name = self.names[id]
        image = get_image(name)
        category = get_category(id)
        image, category = random_crop(image, category)
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        X[i_batch] = image
        Y[i_batch] = to_categorical(category, num_classes)
    X = preprocess_input(X)
    return X, Y

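For context, a sketch of how a batch generator with this `__getitem__` is typically consumed; the class name `SegDataSequence`, its constructor, and the training call are assumptions, not taken from the snippet.

# Hypothetical consumption sketch (names are assumptions): the __getitem__
# above is assumed to belong to a keras.utils.Sequence subclass, which Keras
# iterates batch by batch during training.
import math
from keras.utils import Sequence

class SegDataSequence(Sequence):
    def __init__(self, ids, names):
        self.ids = ids      # sample ids consumed by get_category(id)
        self.names = names  # file names consumed by get_image(name)

    def __len__(self):
        # Number of batches per epoch, using the same module-level batch_size
        # that __getitem__ relies on.
        return int(math.ceil(len(self.ids) / float(batch_size)))

    # __getitem__ exactly as in the snippet above.

# train_gen = SegDataSequence(train_ids, names)
# model.fit_generator(train_gen, epochs=epochs)
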
def main():
    args = make_args()
    config = configparser.ConfigParser()
    utils.load_config(config, args.config)
    for cmd in args.modify:
        utils.modify_config(config, cmd)
    with open(os.path.expanduser(os.path.expandvars(args.logging)), 'r') as f:
        logging.config.dictConfig(yaml.load(f))
    model_dir = utils.get_model_dir(config)
    category = utils.get_category(config)
    anchors = torch.from_numpy(utils.get_anchors(config)).contiguous()
    path, step, epoch = utils.train.load_model(model_dir)
    state_dict = torch.load(path, map_location=lambda storage, loc: storage)
    dnn = utils.parse_attr(config.get('model', 'dnn'))(
        model.ConfigChannels(config, state_dict), anchors, len(category))
    logging.info(
        humanize.naturalsize(
            sum(var.cpu().numpy().nbytes for var in dnn.state_dict().values())))
    dnn.load_state_dict(state_dict)
    height, width = tuple(map(int, config.get('image', 'size').split()))
    image = torch.autograd.Variable(
        torch.randn(args.batch_size, 3, height, width))
    output = dnn(image)
    state_dict = dnn.state_dict()
    closure = utils.walk.Closure(args.name, state_dict,
                                 type(dnn).scope, args.debug)
    closure(output.grad_fn)
    d = utils.dense(state_dict[args.name])
    channels = torch.LongTensor(np.argsort(d)[int(len(d) * args.remove):])
    utils.walk.prune(closure, channels)
    if args.debug:
        path = closure.dot.view(
            os.path.basename(model_dir) + '.gv', os.path.dirname(model_dir))
        logging.info(path)
    else:
        torch.save(state_dict, path)

def startSpiderWap(self):
    if self.spider_queue.empty():
        fetched_users = self.db.execute(
            'SELECT * from spider_list ORDER BY weight DESC limit 0,30')
        if fetched_users <= 0:
            print('nothing to spider, spider_list is empty')
            return False
        self.start = 'start'
        self.errno = ERR_NO
        fetchall = self.db.fetchall()
        # Add the pending sharers fetched from the database to the crawl queue.
        for item in fetchall:
            self.spider_queue.put({
                'sid': item[0],
                'uk': item[1],
                'file_fetched': item[2],
                'follow_fetched': item[3],
                'follow_done': item[4],
                'file_done': item[5],
                'weight': item[6],
                'uid': item[7]
            })
    self.got_follow_count = 0
    self.got_files_count = 0
    self.while_count = 0
    while not self.spider_queue.empty():
        self.while_count += 1
        share_user = self.spider_queue.get()
        # Crawl the sharer's file list.
        if not share_user['file_done']:
            print('%d now spidering file, %d file fetched' % (
                share_user['uk'], share_user['file_fetched']))
            rs = self.getShareListsWap(share_user['uk'],
                                       share_user['file_fetched'])
            if not rs:
                print('uk:%d error to fetch files, try again later...' %
                      share_user['uk'])
                return True
            total_count, fetched_count, file_list = rs
            total_fetched = share_user['file_fetched'] + fetched_count
            print('fetched_file_count:%d' % fetched_count)
            if total_fetched >= total_count or total_count == 0:
                share_user['file_done'] = 1  # all files of this sharer are fetched
            if total_count == 0:
                self.db.execute(
                    "UPDATE spider_list set file_done=%s WHERE sid=%s",
                    (1, share_user['sid']))
                self.db.commit()
            else:
                try:
                    files_count = 0
                    for file in file_list:
                        files_count += 1
                        ext = ''
                        file_type = ''
                        file_type_i = -1
                        if file['isdir'] == 0 and file['feed_type'] == 'share':
                            ext = utils.get_extension(file['title']).lower()
                            file_type = utils.get_category(ext)
                            file_type_i = self.file_type_t[file_type]
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (file['title'], file['uk'], file['shareid'],
                             file['shorturl'], file['isdir'], file['size'],
                             file['md5'], ext, file['feed_time'], time_stamp,
                             file_type_i, share_user['uid'], file['feed_type']))
                except:
                    share_user['file_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    self.db.execute(
                        "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                        (total_fetched, share_user['file_done'],
                         share_user['sid']))
                    self.db.execute(
                        "UPDATE share_users set fetched=%s WHERE uid=%s",
                        (total_fetched, share_user['uid']))
                    share_user['file_fetched'] = total_fetched
                    self.got_files_count += files_count
                    self.db.commit()
        # After the files, the follow list would be crawled; skipped for the wap version for now.
        if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
            share_user['follow_done'] = 1
            print('delete user: %d' % share_user['sid'])
            self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                            (share_user['sid'],))
            self.db.commit()
        time.sleep(SPIDER_INTERVAL)
    print('-----------------Done------------------')
    print('while_count:%d' % self.while_count)
    print('got_follow_count:%d' % self.got_follow_count)
    print('got_files_count:%d' % self.got_files_count)
    return True

def build_discord_tr_embed(comment: dict, cmds_args: dict) -> DiscordEmbed:
    """Creates a Discord embed for a Utopian task request.

    :param comment: Steem root post with task request
    :type comment: dict
    :param cmds_args: Parsed bot commands and arguments
    :type cmds_args: dict
    """
    category = get_category(comment, TASKS_PROPERTIES)
    color = 0
    type_ = None
    thumbnail = None
    if category is not None:
        color = int(TASKS_PROPERTIES[category]["color"][1:], 16)
        type_ = TASKS_PROPERTIES[category]["category"]
        thumbnail = TASKS_PROPERTIES[category]["image_url"]
    title = f'{comment["title"]}'
    description_parts = []
    if cmds_args.get("description") is not None:
        description_parts.append(cmds_args["description"].strip())
    # description_parts.append(
    #     f'*You can read [here]({build_comment_link(comment)}) the whole task by **{comment["author"]}**.*'
    # )
    description = "\n\n".join(description_parts)
    embed = DiscordEmbed(title=title, description=description)
    author = Account(comment["author"])
    embed.set_author(
        name=author.name,
        url=f"{UI_BASE_URL}/@{author.name}",
        icon_url=author.profile.get("profile_image"),
    )
    embed.set_color(color)
    embed.set_footer(text="Verified by Utopian.io team")
    embed.set_thumbnail(url=thumbnail)
    embed.set_timestamp()
    if type_ is not None:
        embed.add_embed_field(name="Task Type", value=type_.upper(), inline=True)
    status = None
    if cmds_args.get("status") is not None:
        status = cmds_args["status"]
        embed.add_embed_field(name="Status", value=status.upper(), inline=True)
    if status and status.upper() == "CLOSED":
        return embed
    if cmds_args.get("skills"):
        skills = normalize_str(cmds_args["skills"])
        embed.add_embed_field(name="Required skills", value=skills, inline=True)
    if cmds_args.get("discord") is not None:
        embed.add_embed_field(
            name="Discord", value=f'{cmds_args["discord"]}', inline=True
        )
    if cmds_args.get("bounty"):
        bounty = normalize_str(cmds_args["bounty"]).upper()
    else:
        bounty = "See the task details"
    embed.add_embed_field(name="Bounty", value=bounty, inline=True)
    if cmds_args.get("deadline"):
        deadline = cmds_args["deadline"]
    else:
        deadline = "Not specified"
    embed.add_embed_field(name="Due date", value=deadline, inline=True)
    is_in_progress = status and status.upper() == "IN PROGRESS"
    if is_in_progress and cmds_args.get("assignees"):
        assignees = normalize_str(cmds_args["assignees"]).lower()
        assignees_links = accounts_str_to_md_links(assignees)
        embed.add_embed_field(name="Assignees", value=assignees_links, inline=False)
    if cmds_args.get("note") is not None:
        embed.add_embed_field(name="Misc", value=f'{cmds_args["note"]}', inline=False)
    return embed

def extract_categories(self):
    for ex in self.transcripts:
        self.categories.add(utils.get_category(ex))

def add_category_features(cls, ex, categories, features):
    category = utils.get_category(ex)
    one_hot = [1 if k == category else 0 for k in categories]
    features.extend(one_hot)

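A small worked example of the one-hot expansion above; the category names are made up purely for illustration.

# Illustrative only: with three known categories and an example whose category
# is 'sell', the slice appended to `features` is [0, 1, 0].
categories = ['buy', 'sell', 'other']
category = 'sell'
one_hot = [1 if k == category else 0 for k in categories]
assert one_hot == [0, 1, 0]
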
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    doc2vec_model = Doc2Vec.load(model_filename)
    with open(data_filename, 'rb') as f:
        documents = cPickle.load(f)
    train_test_ratio = 0.8
    messages = [' '.join(doc.words) for doc in documents]
    categories = {}
    for message in messages:
        category = get_category(message)
        categories.setdefault(category, []).append(message)
    # for category, keywords_ in keywords.iteritems():
    #     pass
    import IPython
    IPython.embed()
    buts = [doc for doc in documents if doc.event == 'BUT']
    riens = [doc for doc in documents if doc.event == 'rien'][:len(buts)]
    documents = buts + riens
    random.shuffle(documents)
    train_size = int(len(documents) * train_test_ratio)
    train_set = documents[:train_size]

try:
    for tweet in ts.search_tweets_iterable(tso):
        try:
            if tweet['user'].get('location') is not None:
                place = tweet['user'].get('location')
                if place:
                    tweet_id = str(tweet['id'])
                    geocode_result = gmaps.geocode(place)
                    lat = geocode_result[0]['geometry']['location']['lat']
                    lng = geocode_result[0]['geometry']['location']['lng']
                    tweet_text = tweet['text'].lower().encode(
                        'ascii', 'ignore').decode('ascii')
                    raw_tweet = {
                        'user': tweet['user']['screen_name'],
                        'text': tweet_text,
                        'place': place,
                        'coordinates': {
                            'location': str(lat) + "," + str(lng)
                        },
                        'time': tweet['created_at'],
                        'category': get_category(tweet_text)
                    }
                    es.index(index=ES_INDEX, doc_type=ES_TYPE,
                             id=tweet_id, body=raw_tweet)
        except Exception as e:
            print(e)
            continue
except TwitterSearchException as e:
    print(e)

    iaa.Multiply((0.8, 1.2), per_channel=0.2),
], random_order=True)

if __name__ == '__main__':
    with open('names.txt', 'r') as f:
        names = f.read().splitlines()
    filename = 'valid_ids.txt'
    with open(filename, 'r') as f:
        ids = f.read().splitlines()
    ids = list(map(int, ids))
    id = random.choice(ids)
    name = names[id]
    image = get_image(name)
    category = get_category(id)
    image = cv.resize(image, (img_rows, img_cols), cv.INTER_NEAREST)
    category = cv.resize(category, (img_rows, img_cols), cv.INTER_NEAREST)
    length = 10
    images = np.zeros((length, img_rows, img_cols, 3), np.uint8)
    categories = np.zeros((length, img_rows, img_cols), np.uint8)
    for i in tqdm(range(length)):
        images[i] = image.copy()
        categories[i] = category.copy()
    images_aug = seq_img.augment_images(images)
    images_aug = seq_det.augment_images(images_aug)
    categories_aug = seq_det.augment_images(categories)

def build_discord_tr_embed(comment: dict, cmds_args: dict) -> DiscordEmbed:
    """Creates a Discord embed for a Utopian task request.

    :param comment: Steem root post with task request
    :type comment: dict
    :param cmds_args: Parsed bot commands and arguments
    :type cmds_args: dict
    """
    category = get_category(comment, TASKS_PROPERTIES)
    color = 0
    type_ = None
    thumbnail = None
    if category is not None:
        color = int(TASKS_PROPERTIES[category]["color"][1:], 16)
        type_ = TASKS_PROPERTIES[category]["category"]
        thumbnail = TASKS_PROPERTIES[category]["image_url"]
    title = f'{comment["title"]}'
    description = None
    if cmds_args.get("description"):
        description = cmds_args["description"]
    embed = DiscordEmbed(title=title, description=description)
    author = comment["author"]
    embed.set_author(
        name=author,
        url=f"{UI_BASE_URL}/@{author}",
        icon_url=f"https://steemitimages.com/u/{author}/avatar",
    )
    embed.set_color(color)
    embed.set_footer(text="Verified by Utopian.io team")
    embed.set_thumbnail(url=thumbnail)
    embed.set_timestamp()
    if type_ is not None:
        embed.add_embed_field(name="Task Type", value=type_.upper(), inline=True)
    status = None
    if cmds_args.get("status") is not None:
        status = cmds_args["status"]
        embed.add_embed_field(name="Status", value=status.upper(), inline=True)
    if status and status.upper() == "CLOSED":
        return embed
    if cmds_args.get("skills"):
        skills = ", ".join(cmds_args["skills"])
        embed.add_embed_field(name="Required skills", value=skills, inline=True)
    if cmds_args.get("discord"):
        embed.add_embed_field(name="Discord", value=f'{cmds_args["discord"]}',
                              inline=True)
    if cmds_args.get("bounty"):
        bounty = ", ".join(cmds_args["bounty"])
    else:
        bounty = "See the task details"
    embed.add_embed_field(name="Bounty", value=bounty, inline=True)
    deadline = cmds_args.get("deadline")
    if not deadline:
        deadline = "Not specified"
    embed.add_embed_field(name="Due date", value=deadline, inline=True)
    is_in_progress = status and status.upper() == "IN PROGRESS"
    if is_in_progress and cmds_args.get("assignees"):
        assignees = ", ".join([f"@{a}" for a in cmds_args["assignees"]])
        assignees_links = accounts_str_to_md_links(assignees)
        embed.add_embed_field(name="Assignees", value=assignees_links, inline=False)
    if cmds_args.get("note") is not None:
        embed.add_embed_field(name="Misc", value=f'{cmds_args["note"]}', inline=False)
    return embed

def startSpider(self):
    if self.spider_queue.empty():
        fetched_users = self.db.execute(
            'SELECT * from spider_list ORDER BY weight DESC limit 0,20')
        if fetched_users <= 0:
            print('nothing to spider, spider_list is empty')
            return False
        self.start = 'start'
        self.errno = ERR_NO
        fetchall = self.db.fetchall()
        # Add the pending sharers fetched from the database to the crawl queue.
        for item in fetchall:
            self.spider_queue.put({
                'sid': item[0],
                'uk': item[1],
                'file_fetched': item[2],
                'follow_fetched': item[3],
                'follow_done': item[4],
                'file_done': item[5],
                'weight': item[6],
                'uid': item[7]
            })
    self.got_follow_count = 0
    self.got_files_count = 0
    self.while_count = 0
    while not self.spider_queue.empty():
        self.while_count += 1
        share_user = self.spider_queue.get()
        # Crawl the sharer's file list.
        if not share_user['file_done']:
            print('%d now spidering file, %d file fetched' % (
                share_user['uk'], share_user['file_fetched']))
            rs = self.getShareLists(share_user['uk'], share_user['file_fetched'])
            if not rs:
                print('uk:%d error to fetch files, try again later...' %
                      share_user['uk'])
                return True
            total_count, fetched_count, file_list = rs
            total_fetched = share_user['file_fetched'] + fetched_count
            print('fetched_file_count:%d' % fetched_count)
            if total_fetched >= total_count or total_count == 0:
                share_user['file_done'] = 1  # all files of this sharer are fetched
            if total_count == 0:
                self.db.execute(
                    "UPDATE spider_list set file_done=%s WHERE sid=%s",
                    (1, share_user['sid']))
                self.db.commit()
            else:
                try:
                    files_count = 0
                    for file in file_list:
                        files_count += 1
                        ext = ''
                        file_type = ''
                        file_type_i = -1
                        if file['isdir'] == 0 and file['feed_type'] == 'share':
                            ext = utils.get_extension(file['title']).lower()
                            file_type = utils.get_category(ext)
                            file_type_i = self.file_type_t[file_type]
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (file['title'], file['uk'], file['shareid'],
                             file['shorturl'], file['isdir'], file['size'],
                             file['md5'], ext, file['feed_time'], time_stamp,
                             file_type_i, share_user['uid'], file['feed_type']))
                except:
                    share_user['file_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    self.db.execute(
                        "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                        (total_fetched, share_user['file_done'], share_user['sid']))
                    self.db.execute(
                        "UPDATE share_users set fetched=%s WHERE uid=%s",
                        (total_fetched, share_user['uid']))
                    share_user['file_fetched'] = total_fetched
                    self.got_files_count += files_count
                    self.db.commit()
        # After the files, crawl the sharer's follow list.
        if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
            print('%d now spidering follow, %d follow fetched' % (
                share_user['uk'], share_user['follow_fetched']))
            rs = self.getFollows(share_user['uk'], share_user['follow_fetched'])
            if not rs:
                print('error to fetch follows, try again later...')
                return
            total_count, fetched_count, follow_list = rs
            total_fetched = share_user['follow_fetched'] + fetched_count
            print('fetched_follow_count:%d' % fetched_count)
            if total_fetched >= total_count or total_count == 0:
                share_user['follow_done'] = 1
            if total_count == 0:
                self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                                (share_user['sid'],))
                self.db.commit()
            else:
                try:
                    follow_count = 0
                    for follow in follow_list:
                        follow_count += 1
                        # Skip users that are already in the share_users table.
                        if self.db.execute('SELECT * FROM share_users WHERE uk=%s',
                                           (follow['follow_uk'],)) > 0:
                            print('uk:%d has already in share_user table' %
                                  follow['follow_uk'])
                            continue
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (follow['follow_uk'], follow['follow_uname'],
                             follow['avatar_url'], follow['intro'],
                             follow['follow_count'], follow['album_count'],
                             follow['fans_count'], follow['pubshare_count'],
                             time_stamp, time_stamp, 5))
                        # Add newly found sharers to the crawl list.
                        self.db.execute(
                            "INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",
                            (follow['follow_uk'], self.db.last_row_id()))
                except:
                    share_user['follow_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    if share_user['follow_done'] == 1:
                        # All follows fetched; this sharer is done, remove it from the pending list.
                        print('delete follow fetched sid:%d from spider_list' %
                              share_user['sid'])
                        self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                                        (share_user['sid'],))
                    else:
                        self.db.execute(
                            "UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",
                            (total_fetched, share_user['follow_done'],
                             share_user['sid']))
                    share_user['follow_fetched'] = total_fetched
                    self.got_follow_count += follow_count
                    self.db.commit()
        # If the sharer is not fully crawled yet, put it back on the queue.
        if share_user['follow_done'] == 0:
            self.spider_queue.put(share_user)
        else:
            print('%d has done' % share_user['uk'])
            del share_user
        time.sleep(SPIDER_INTERVAL)
    print('-----------------Done------------------')
    print('while_count:%d' % self.while_count)
    print('got_follow_count:%d' % self.got_follow_count)
    print('got_files_count:%d' % self.got_files_count)
    return True

confirm_button = st.checkbox('GO!')
if confirm_button:
    show_covid_feature_relationship(group_dict, sub_feature_list)

# 1.1.2 show how one is affected by multiple
show_covid_feature_multi_relationship(total_covid_feature, yelp_covid_bool_df)

# city
# geometric interactive, state/city

st.write("## 3. How do businesses' categories affect their reaction?")
st.markdown("Let's now explore how businesses of different categories behave. \
We start by looking at whether different categories react differently with the above COVID features.")

business_category_info = get_category(yelp_join)
show_business_in_category(yelp_covid_bool_df, business_category_info)

st.markdown("### 3.1 How long do they plan to close")
close_for_how_long(yelp_join)

st.markdown("### 3.2 What does the COVID banner say")
what_covid_banner_say(yelp_join, business_category_info)

st.markdown("### 3.3 What is in the highlights")
business_highlight_info = get_highlight_info(yelp_join)
what_are_highlights(business_highlight_info)

st.markdown("## 2. How do businesses' locations affect their reaction?")
total_targets = yelp_covid_bool_df.columns[1:]
