def index_source():
    """ Index new domains into our source table """
    query = """
        select as2.domain from
        (select distinct "domain" from apis_story) as2
        left join apis_source src on as2."domain" = src."name"
        where src."name" is null
    """
    new_sources = connect(query)

    values = []
    logging.info("{} new domains".format(len(new_sources)))
    for source in new_sources:
        values.append((str(uuid.uuid4()), source[0], 1.0))

    insert_query = """
        INSERT INTO public.apis_source
        (uuid, "name", score)
        VALUES(%s, %s, %s);
    """
    insert_values(insert_query, values)
def get_types_ids(labels):
    """
    Check whether there are new entity types; if so, add them to the
    entity type table, then return the UUIDs of all the given types.
    """
    ids_str = "', '".join(labels)
    ids_str = "('{}')".format(ids_str)
    query = """
        select uuid, name from apis_entitytype
        where name in {}
    """.format(ids_str)
    results = connect(query, verbose=False)

    existing_types = [item[1] for item in results]
    new_types = set(labels) - set(existing_types)

    if new_types:
        logging.info("inserting: {}".format(", ".join(new_types)))
        insert_query = """
            INSERT INTO public.apis_entitytype
            (uuid, "name")
            VALUES(%s, %s);
        """
        values = []
        for etype in new_types:
            values.append((str(uuid.uuid4()), etype))
        insert_values(insert_query, values)
        # re-fetch so the newly inserted types are included in the mapping
        results = connect(query, verbose=False)

    types = {item[1]: item[0] for item in results}
    return types
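# Usage sketch for get_types_ids (the labels below are illustrative NER-style
# type names, not values taken from the pipeline): the function returns a
# {name: uuid} mapping, which callers can use to resolve the "typeID_id"
# foreign key when inserting entity rows.
#
#   type_ids = get_types_ids(["ORG", "PERSON", "GPE"])
#   org_type_uuid = type_ids["ORG"]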
def insert_story_entity_map(values):
    query = """
        INSERT INTO public.apis_storyentitymap
        (uuid, "entityID_id", "storyID_id", mentions, salience, created_at)
        VALUES(%s, %s, %s, %s, %s, %s);
    """
    insert_values(query, values)


def insert_story_entity_ref(values):
    query = """
        INSERT INTO public.apis_storyentityref
        (uuid, "name", "typeID_id", wikipedia, render, created_at)
        VALUES(%s, %s, %s, %s, %s, %s);
    """
    insert_values(query, values)


def insert_entity_alias(values):
    query = """
        INSERT INTO public.entity_alias
        (uuid, "name", wikipedia, score, created_at, "parentID_id", "typeID_id")
        VALUES(%s, %s, %s, %s, %s, %s, %s);
    """
    insert_values(query, values)
def insert_entity_scores(df, bucket_ids, model_uuid):
    """
    Insert entity scores into the db by fetching the UUIDs of all the
    entities present in a particular article.
    """
    article_uuids = df["uuid"].unique()
    ids_str = "', '".join(article_uuids)
    ids_str = "('{}')".format(ids_str)

    query = """
        select "entityID_id", "storyID_id" from apis_storyentitymap as2
        where as2."storyID_id" in {}
    """.format(ids_str)
    results = connect(query, verbose=False)

    entity_df = pd.DataFrame(results, columns=["entity_id", "story_id"])
    logging.info("{} articles found".format(entity_df["story_id"].nunique()))
    logging.info("{} entities found".format(entity_df["entity_id"].nunique()))

    # get all unique entities in each article
    entity_df = entity_df.groupby(["story_id"])["entity_id"].apply(set).reset_index()

    df = df.merge(entity_df, how="left", left_on="uuid", right_on="story_id")
    df.drop("story_id", axis=1, inplace=True)
    df["entity_id"] = df.apply(add_article_entity_to_score, axis=1)

    # for each row, each entity and each bucket, insert a score
    values = []
    for _, row in df.iterrows():
        for entity in row["entity_id"]:
            for bucket in bucket_ids.keys():
                log_row = (str(uuid.uuid4()), row["uuid"], row[bucket],
                           bucket_ids[bucket], entity, model_uuid,
                           row["sourceUUID"], str(datetime.now()))
                values.append(log_row)

    logging.info("writing {} articles into entity scores".format(df.shape[0]))
    insert_query = """
        INSERT INTO public.apis_entityscore
        (uuid, "storyID_id", "grossScore", "bucketID_id",
         "entityID_id", "modelID_id", "sourceID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """
    insert_values(insert_query, values)
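# Usage sketch for insert_entity_scores (bucket names and values below are
# hypothetical; the required columns follow the code above): `df` holds one row
# per scored article with "uuid" and "sourceUUID" columns plus one column per
# bucket, and `bucket_ids` maps those column names to apis_bucket uuids.
#
#   bucket_ids = {"financial_risk": "<bucket-uuid>", "legal_risk": "<bucket-uuid>"}
#   scores_df = pd.DataFrame({
#       "uuid": ["<story-uuid>"],
#       "sourceUUID": ["<source-uuid>"],
#       "financial_risk": [0.7],
#       "legal_risk": [0.1],
#   })
#   insert_entity_scores(scores_df, bucket_ids, model_uuid="<model-uuid>")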
def insert_clustermap(CLUSTER_MAP):
    """ Insert cluster mappings, updating the cluster on conflict """
    logging.info('Inserting {} cluster mappings'.format(len(CLUSTER_MAP)))
    query = """
        INSERT INTO ml_clustermap
        (uuid, "clusterID_id", "storyID_id", published_date)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT(uuid) DO UPDATE
        SET "clusterID_id" = EXCLUDED."clusterID_id"
    """
    insert_values(query, CLUSTER_MAP)
def insert_cluster(new_clusters):
    """ Insert new clusters, numbering them after the current max cluster number """
    logging.info('Inserting {} new clusters'.format(len(new_clusters)))
    cluster_num = get_max_cluster_num()

    cluster_ids = []
    for cluster_uuid in np.unique(new_clusters):
        cluster_num += 1
        cluster_ids.append((cluster_uuid, str(cluster_num)))

    query = """
        INSERT INTO ml_cluster
        (uuid, cluster)
        VALUES(%s, %s)
    """
    insert_values(query, cluster_ids)
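# Usage sketch for the two cluster helpers (values are hypothetical; cluster
# uuids are assumed to be generated upstream by the clustering job):
#
#   cluster_uuid = str(uuid.uuid4())
#   insert_cluster([cluster_uuid])          # registers the cluster with the next number
#   insert_clustermap([(str(uuid.uuid4()),  # mapping row uuid
#                       cluster_uuid,       # ml_cluster uuid ("clusterID_id")
#                       story_uuid,         # apis_story uuid ("storyID_id")
#                       published_date)])   # story publish date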
def sentiment_from_body():
    """ Run sentiment analysis on the body of each article """
    # fetch all stories that have a body and no body-level
    # sentiment analysis yet
    query = """
        SELECT story.uuid, body.body
        FROM public.apis_story story
        LEFT JOIN (SELECT distinct "storyID_id"
                   FROM public.apis_storysentiment
                   WHERE is_headline=false) sentiment
        ON story.uuid = sentiment."storyID_id"
        INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
                    from apis_storybody group by "storyID_id") AS body
        ON story.uuid = body."storyID_id"
        WHERE sentiment."storyID_id" IS NULL
        LIMIT 20000
    """
    response = connect(query)

    values = []
    logging.info("running sentiment analysis on {} articles".format(len(response)))
    count = 1
    for story_uuid, body in response:
        sentiment = get_sentiment(body)
        values.append((str(uuid.uuid4()), False, json.dumps(sentiment),
                       story_uuid, str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
        INSERT INTO public.apis_storysentiment
        (uuid, is_headline, sentiment, "storyID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s);
    """
    insert_values(insert_query, values)
    logging.info("finished")
def insert_bucket_scores(df, bucket_ids, model_uuid):
    """ Insert bucket scores into the db """
    values = []
    for _, row in df.iterrows():
        for bucket in bucket_ids.keys():
            log_row = (str(uuid.uuid4()), row["uuid"], str(datetime.now()),
                       row[bucket], bucket_ids[bucket], model_uuid,
                       row["sourceUUID"], row["published_date"])
            values.append(log_row)

    logging.info("writing {} articles into bucket scores".format(df.shape[0]))
    insert_query = """
        INSERT INTO public.apis_bucketscore
        (uuid, "storyID_id", "entryTime", "grossScore",
         "bucketID_id", "modelID_id", "sourceID_id", "storyDate")
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """
    insert_values(insert_query, values)
def sentiment_on_headlines():
    """ Run sentiment analysis on the headline of each article """
    # fetch titles of all stories that have no headline-level
    # sentiment analysis yet
    query = """
        SELECT story.uuid, title
        FROM public.apis_story story
        LEFT JOIN (SELECT distinct "storyID_id"
                   FROM public.apis_storysentiment
                   WHERE is_headline=true) sentiment
        ON story.uuid = sentiment."storyID_id"
        WHERE sentiment."storyID_id" IS NULL
        LIMIT 20000
    """
    response = connect(query)

    values = []
    logging.info("running sentiment analysis on {} headlines".format(len(response)))
    count = 1
    for story_uuid, headline in response:
        sentiment = get_sentiment(headline)
        values.append((str(uuid.uuid4()), True, json.dumps(sentiment),
                       story_uuid, str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
        INSERT INTO public.apis_storysentiment
        (uuid, is_headline, sentiment, "storyID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s);
    """
    insert_values(insert_query, values)
    logging.info("finished")
def publish_to_source(**kwargs):
    """
    Publish companies to scrape to the pubsub so the news
    aggregator can process the data.
    """
    SOURCE_UUID = kwargs["source_uuid"]
    SOURCE = kwargs["source"]

    # load companies which were added to be tracked
    query = """
        select entity.uuid, entity.name, keywords,
               "scenarioID_id", scenario."trackingDays"
        from public.apis_entity entity
        left join public.apis_scenario scenario
        on entity."scenarioID_id" = scenario.uuid
        where "entryVerified"=true;
    """
    results = connect(query, verbose=False)
    df = pd.DataFrame(results, columns=[
        "entity_id", "name", "keywords", "scenario_id", "trackingDays"
    ])
    df = df.apply(lambda x: get_last_tracked(x, SOURCE), axis=1)

    items_to_insert = []
    for _, row in df.iterrows():
        params = {}
        params["entity_id"] = row["entity_id"]
        params["entity_name"] = row["name"]
        params["scenario_id"] = row["scenario_id"]
        params["source"] = [SOURCE]
        params["storage_bucket"] = BUCKET_NAME
        params["history_processed"] = row["history_processed"]

        if not row["history_processed"]:
            # backfill: walk the tracking period in 30-day windows,
            # publishing each complete window per pair of keywords
            date_from = datetime.now() - \
                timedelta(days=int(row["trackingDays"]))
            date_to = datetime.now()
            while True:
                date_runner = date_from + timedelta(days=30)
                params["date_from"] = datetime.strftime(date_from, DATE_FORMAT)
                params["date_to"] = datetime.strftime(date_runner, DATE_FORMAT)
                date_from = date_runner
                if date_runner >= date_to:
                    # roll back to the end of the last published window so the
                    # recorded "last tracked" date excludes the partial window
                    date_to = date_runner - timedelta(days=30)
                    break
                for i in range(0, len(row['keywords']), 2):
                    params["common_names"] = row["keywords"][i:i + 2]
                    success = publish(params)
        else:
            # incremental: publish from the last tracked date up to now
            date_from = row["last_tracked"]
            date_to = datetime.now()
            params["date_from"] = datetime.strftime(date_from, DATE_FORMAT)
            params["date_to"] = datetime.strftime(date_to, DATE_FORMAT)
            for i in range(0, len(row['keywords']), 5):
                params["common_names"] = row["keywords"][i:i + 5]
                success = publish(params)

        logging.info("{} - {}".format(row["name"], date_to))

        # if publishing succeeded, update the company's last tracked date
        if success:
            items_to_insert.append((
                str(uuid.uuid4()),
                str(date_to),
                params["entity_id"],
                SOURCE_UUID,
            ))

    insert_values(INSERT_QUERY, items_to_insert)
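# Sketch of the 30-day backfill windowing used in publish_to_source, pulled out
# as a hypothetical helper for illustration only (it is not called by the
# pipeline). Like the while-loop above, it yields only complete 30-day windows;
# the trailing partial window up to "now" is left for the next incremental run.
def _history_windows(tracking_days, window_days=30):
    date_from = datetime.now() - timedelta(days=int(tracking_days))
    date_to = datetime.now()
    while True:
        date_runner = date_from + timedelta(days=window_days)
        if date_runner >= date_to:
            break
        yield date_from, date_runner
        date_from = date_runner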