def get_types_ids(labels):
    """
    Check whether there are new entity types; if so, add them to the
    entity type table, then return the UUIDs of all the entity types.
    """
    ids_str = "', '".join(labels)
    ids_str = "('{}')".format(ids_str)

    query = """
        select uuid, name from apis_entitytype
        where name in {}
        """.format(ids_str)

    results = connect(query, verbose=False)
    existing_types = [item[1] for item in results]
    new_types = set(labels) - set(existing_types)

    if new_types:
        logging.info("inserting: {}".format(", ".join(new_types)))
        insert_query = """
            INSERT INTO public.apis_entitytype
            (uuid, "name")
            VALUES(%s, %s);"""
        values = []
        for etype in new_types:
            values.append((str(uuid.uuid4()), etype))
        insert_values(insert_query, values)
        results = connect(query, verbose=False)

    types = {item[1]: item[0] for item in results}
    return types

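# NOTE: `connect` and `insert_values` are shared DB helpers defined elsewhere in
# this repo. Purely as an illustration of the interface assumed throughout this
# module, psycopg2-based sketches could look like the ones below (the connection
# parameters are placeholders, not values from this repo).
import psycopg2


def _connect_sketch(query, verbose=True):
    """Run a read-only query and return all rows (illustrative only)."""
    conn = psycopg2.connect(host="localhost", dbname="apis",
                            user="postgres", password="postgres")
    try:
        with conn.cursor() as cur:
            cur.execute(query)
            rows = cur.fetchall()
            if verbose:
                logging.info("fetched {} rows".format(len(rows)))
            return rows
    finally:
        conn.close()


def _insert_values_sketch(insert_query, values):
    """Bulk-insert parameterised rows (illustrative only)."""
    conn = psycopg2.connect(host="localhost", dbname="apis",
                            user="postgres", password="postgres")
    try:
        with conn.cursor() as cur:
            cur.executemany(insert_query, values)
        conn.commit()
    finally:
        conn.close()
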
def match_manual_entity_to_story(df):
    """
    Manually check whether our pre-determined entities appear in each
    story's text and create a link for every match.
    """
    query = """
        select entity.uuid, entity.name from apis_entity ae
        inner join apis_storyentityref entity
        on ae.uuid = entity.uuid
        """
    results = connect(query, verbose=False)

    df["text"] = df["title"] + df["body"]
    df["text"] = df["text"].str.lower()

    story_map_inputs = []
    for _, row in df.iterrows():
        for entity in results:
            if entity[1].lower() in str(row["text"]):
                data = (str(uuid.uuid4()), entity[0], row["uuid"],
                        1, 0.5, str(datetime.utcnow()))
                story_map_inputs.append(data)

    logging.info("{} manual relations found".format(len(story_map_inputs)))
    insert_story_entity_map(story_map_inputs)

def get_last_tracked(row, source):
    # Query to find out whether we have tracked this particular entity.
    # If we are tracking it, it returns the last tracked date for the
    # given scrape source (if it was scraped).
    query = """
        select max(lastScraped) as lastScrape from
        (select ls."lastScraped" as lastScraped, ss.name, ls."entityID_id"
        from (public.apis_lastscrape as ls
        left join public.apis_scrapesource as ss
        on ls."scrapeSourceID_id"=ss.uuid)
        where name = '{}' and ls."entityID_id" = '{}') fp
        """.format(source, row["entity_id"])

    results = connect(query, verbose=False)
    row["last_tracked"] = results[0][0]
    if not results[0][0]:
        row["history_processed"] = False
    else:
        row["history_processed"] = True
    return row

def index_source():
    """
    Index new domains into our source table.
    """
    query = """
        select as2.domain from
        (select distinct "domain" from apis_story) as2
        left join apis_source src
        on as2."domain" = src."name"
        where src."name" is null
        """
    new_sources = connect(query)

    values = []
    logging.info("{} new domains".format(len(new_sources)))
    for source in new_sources:
        values.append((str(uuid.uuid4()), source[0], 1.0))

    insert_query = """
        INSERT INTO public.apis_source
        (uuid, "name", score)
        VALUES(%s, %s, %s);
        """
    insert_values(insert_query, values)

def get_scenario_articles(model_uuid, scenario, body=False, article_count=5000):
    """
    Fetch articles belonging to our risk scenario which we haven't yet
    scored with the current model.

    Articles are filtered on the following criteria:
        * No score in bucketscore for this model
        * Source exists
        * Relations exist in entitymap, i.e. entity extraction is done
        * EntityID exists in entityref
    """
    if body:
        query = """
            select story.uuid, title, body, published_date,
            src.uuid as sourceUUID, "entityID_id" as entityUUID
            from public.apis_story story
            left join (SELECT distinct "storyID_id" FROM public.apis_bucketscore
            where "modelID_id" = '{}') ab
            on story.uuid = ab."storyID_id"
            inner join (select "storyID_id", (array_agg(body))[1] as body
            from apis_storybody group by "storyID_id") story_body
            on story_body."storyID_id" = story.uuid
            inner join (select distinct "storyID_id" from apis_storyentitymap) entitymap
            on story.uuid = entitymap."storyID_id"
            inner join public.apis_source as src
            on src."name" = story."domain"
            left join public.apis_scenario as scnr
            on scnr.uuid = story."scenarioID_id"
            where scnr."name" = '{}'
            and ab."storyID_id" is null
            and "entityID_id" in (select uuid from apis_storyentityref story)
            limit {}
            """.format(model_uuid, scenario, article_count)
    else:
        query = """
            select story.uuid, title, published_date,
            src.uuid as sourceUUID, "entityID_id" as entityUUID
            from public.apis_story story
            left join (SELECT distinct "storyID_id" FROM public.apis_bucketscore
            where "modelID_id" = '{}') ab
            on story.uuid = ab."storyID_id"
            inner join (select distinct "storyID_id" from apis_storyentitymap) entitymap
            on story.uuid = entitymap."storyID_id"
            inner join public.apis_source as src
            on src."name" = story."domain"
            left join public.apis_scenario as scnr
            on scnr.uuid = story."scenarioID_id"
            where scnr."name" = '{}'
            and ab."storyID_id" is null
            and "entityID_id" in (select uuid from apis_storyentityref story)
            limit {}
            """.format(model_uuid, scenario, article_count)

    articles = connect(query)
    return articles

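# A minimal usage sketch for get_scenario_articles: the scenario name below is
# a placeholder, and the column list simply mirrors the SELECT used when
# body=True. This wrapper does not exist in the repo; it only illustrates the
# expected result shape.
def _scenario_articles_df_sketch(model_uuid, scenario="risk"):
    articles = get_scenario_articles(model_uuid, scenario, body=True)
    return pd.DataFrame(articles, columns=[
        "uuid", "title", "body", "published_date", "sourceUUID", "entityUUID"])
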
def fuzzy_matching():
    """
    Score
        * -1: Matched using Wikipedia
        * -2: Should be a parent - create the parent and a corresponding alias
        * 0 - 100: Match found using fuzzy matching
    """
    results = connect("""select alias.name, wikipedia, "parentID_id", enttype."name"
                      from entity_alias alias
                      inner join apis_entitytype enttype
                      on alias."typeID_id" = enttype.uuid""")
    alias = pd.DataFrame(results, columns=['name', 'wikipedia', 'parent', 'type'])

    logging.info("Starting fuzzy matching.")
    merged_df = pd.read_csv("merged_df.csv")
    merged_df['score'] = np.nan
    merged_df = process_fuzzy(merged_df, alias)

    logging.info("Found {} parents.".format(
        merged_df[merged_df["score"] == -2].shape[0]))
    logging.info("Matched {} with wikilinks.".format(
        merged_df[merged_df["score"] == -1].shape[0]))
    logging.info("Matched {} with fuzzy matching.".format(
        merged_df[merged_df["score"] > -1].shape[0]))

    merged_df.to_csv("merged_fuzzy_df.csv", index=False)

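# `process_fuzzy` is implemented elsewhere in the repo. Purely as an
# illustration of the scoring convention documented above, a per-name match
# against the alias table might look like the sketch below; rapidfuzz is an
# assumption here, not necessarily what process_fuzzy actually uses.
from rapidfuzz import fuzz, process as fuzz_process


def _score_alias_sketch(name, wiki, alias):
    """Return (score, matched_alias) following the -1 / -2 / 0-100 convention."""
    # an exact Wikipedia link match takes precedence
    if wiki and (alias["wikipedia"] == wiki).any():
        return -1, alias.loc[alias["wikipedia"] == wiki, "name"].iloc[0]
    # otherwise fall back to fuzzy string matching against known aliases
    match = fuzz_process.extractOne(name, alias["name"].tolist(),
                                    scorer=fuzz.token_sort_ratio)
    if match:
        return match[1], match[0]
    # nothing matched: the entity would become a new parent (and alias)
    return -2, None
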
def get_articles():
    """
    Fetch all stories from active scenarios where a body exists and we
    haven't done entity recognition yet.
    """
    query = """
        SELECT story.uuid, story.title, body.body,
        story."scenarioID_id", published_date
        FROM public.apis_story story
        LEFT JOIN (SELECT distinct "storyID_id"
        FROM public.apis_storyentitymap) entity
        ON story.uuid = entity."storyID_id"
        INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
        from apis_storybody where status_code=200
        group by "storyID_id") AS body
        ON story.uuid = body."storyID_id"
        WHERE entity."storyID_id" IS null
        AND "language" in ('english', 'US', 'CA', 'AU', 'IE')
        AND "scenarioID_id" in
        (SELECT uuid FROM apis_scenario as2 WHERE status = 'active')
        LIMIT 1000
        """
    response = connect(query)
    df = pd.DataFrame(
        response,
        columns=["uuid", "title", "body", "scenario_id", "published_date"])
    return df

def insert_entity_scores(df, bucket_ids, model_uuid):
    """
    Insert entity scores into the db by fetching the UUIDs of all the
    entities present in each article.
    """
    article_uuids = df["uuid"].unique()
    ids_str = "', '".join(article_uuids)
    ids_str = "('{}')".format(ids_str)

    query = """
        select "entityID_id", "storyID_id" from apis_storyentitymap as2
        where as2."storyID_id" in {}
        """.format(ids_str)

    results = connect(query, verbose=False)
    entity_df = pd.DataFrame(results, columns=["entity_id", "story_id"])

    logging.info("{} articles found".format(entity_df["story_id"].nunique()))
    logging.info("{} entities found".format(entity_df["entity_id"].nunique()))

    # get all unique entities in the articles
    entity_df = entity_df.groupby(
        ["story_id"])["entity_id"].apply(set).reset_index()

    df = df.merge(entity_df, how="left", left_on="uuid", right_on="story_id")
    df.drop("story_id", axis=1, inplace=True)
    df["entity_id"] = df.apply(add_article_entity_to_score, axis=1)

    values = []
    # for each row, for each entity and for each bucket, insert the scores
    for _, row in df.iterrows():
        for entity in row["entity_id"]:
            for bucket in bucket_ids.keys():
                log_row = (str(uuid.uuid4()), row["uuid"], row[bucket],
                           bucket_ids[bucket], entity, model_uuid,
                           row["sourceUUID"], str(datetime.now()))
                values.append(log_row)

    logging.info("writing {} articles into entity scores".format(df.shape[0]))

    insert_query = """
        INSERT INTO public.apis_entityscore
        (uuid, "storyID_id", "grossScore", "bucketID_id",
        "entityID_id", "modelID_id", "sourceID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
        """
    insert_values(insert_query, values)

def insert_entity_into_entityref():
    """
    Fetch all entities that are not in entityref and insert them into it.
        * If an entity exists in the apis_entity table but not in
          apis_storyentityref under the same UUID, add it to
          apis_storyentityref; all entity UUIDs, new and old, are saved
          to merged_df.
        * Check whether an alias exists for the entity; otherwise insert
          the values into entity_alias too.
    """
    entity_ref_query = """
        select entity.uuid, entity.name as legal_name,
        entity."typeID_id", "wikipedia", true, created_at
        from apis_entity entity
        where uuid not in
        (select ae.uuid from apis_entity ae
        inner join apis_storyentityref ar
        on ae.uuid = ar.uuid)
        and entity."entryVerified"=true
        """
    entity_ref_results = connect(entity_ref_query, verbose=False)
    logging.info("{} entities to insert to entityref".format(
        len(entity_ref_results)))
    insert_story_entity_ref(entity_ref_results)

    alias_query = """
        select uuid_generate_v4(), entity.name, "wikipedia",
        -3, now()::text, entity.uuid, entity."typeID_id"
        from apis_entity entity
        where uuid not in
        (select ae.uuid from apis_entity ae
        inner join entity_alias ar
        on ae.uuid = ar."parentID_id")
        and entity."entryVerified"=true
        """
    alias_results = connect(alias_query, verbose=False)
    logging.info("{} entities to insert to entity_alias".format(
        len(alias_results)))
    insert_entity_alias(alias_results)

def get_entities():
    # Fetch all verified, manually entered entities that we have in our storage
    query = """
        select entity.uuid, entity.name as legal_name
        from public.apis_entity entity
        where "manualEntry"=true and "entryVerified"=true;
        """
    results = connect(query, verbose=False)
    insert_entity_into_entityref()
    entity_df = pd.DataFrame(results, columns=["entity_id", "legal_name"])
    return entity_df

def get_model_details(scenario):
    """
    Get the bucket and model details.
    """
    model_query = """
        select am.uuid, bucket, storage_link, am."name"
        from apis_modeldetail am
        left join apis_scenario scr
        on am."scenarioID_id" = scr.uuid
        where scr."name" = '{}'
        and "version" = (select max("version")
        from apis_modeldetail am
        left join apis_scenario scr
        on am."scenarioID_id" = scr.uuid
        where scr."name" = '{}')
        """.format(scenario, scenario)
    results = connect(model_query)
    logging.info(results)
    return results

def sentiment_from_body():
    """
    Runs sentiment analysis on the body of each article.
    """
    # fetch all stories where a body exists and we haven't done
    # sentiment analysis yet
    query = """
        SELECT story.uuid, body.body
        FROM public.apis_story story
        LEFT JOIN (SELECT distinct "storyID_id"
        FROM public.apis_storysentiment
        WHERE is_headline=false) sentiment
        ON story.uuid = sentiment."storyID_id"
        INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
        from apis_storybody
        group by "storyID_id") AS body
        ON story.uuid = body."storyID_id"
        WHERE sentiment."storyID_id" IS NULL
        LIMIT 20000
        """
    response = connect(query)
    values = []
    logging.info("running sentiment analysis on {} articles".format(len(response)))

    count = 1
    for story_uuid, body in response:
        sentiment = get_sentiment(body)
        values.append(
            (str(uuid.uuid4()), False, json.dumps(sentiment),
             story_uuid, str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
        INSERT INTO public.apis_storysentiment
        (uuid, is_headline, sentiment, "storyID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s);
        """
    insert_values(insert_query, values)
    logging.info("finished")

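# `get_sentiment` is defined elsewhere in the repo. Purely as an illustrative
# stand-in for the interface assumed above (text in, JSON-serialisable dict of
# scores out), NLTK's VADER analyser could be used like this:
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires nltk.download('vader_lexicon')


def _get_sentiment_sketch(text):
    """Return a dict of sentiment scores for the given text."""
    analyzer = SentimentIntensityAnalyzer()
    # e.g. {'neg': 0.0, 'neu': 0.7, 'pos': 0.3, 'compound': 0.6}
    return analyzer.polarity_scores(text)
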
def sentiment_on_headlines():
    """
    Runs sentiment analysis on the headline of each article.
    """
    # fetch the titles of all stories we haven't done
    # sentiment analysis on yet
    query = """
        SELECT story.uuid, title
        FROM public.apis_story story
        LEFT JOIN (SELECT distinct "storyID_id"
        FROM public.apis_storysentiment
        WHERE is_headline=true) sentiment
        ON story.uuid = sentiment."storyID_id"
        WHERE sentiment."storyID_id" IS NULL
        LIMIT 20000
        """
    response = connect(query)
    values = []
    logging.info("running sentiment analysis on {} headlines".format(len(response)))

    count = 1
    for story_uuid, headline in response:
        sentiment = get_sentiment(headline)
        values.append(
            (str(uuid.uuid4()), True, json.dumps(sentiment),
             story_uuid, str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
        INSERT INTO public.apis_storysentiment
        (uuid, is_headline, sentiment, "storyID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s);
        """
    insert_values(insert_query, values)
    logging.info("finished")

def fetch_scenarios():
    """
    Publish a message to generate the feed for each active scenario.
    """
    query = """
        select uuid, name, mode from apis_scenario
        where status = 'active'
        """
    data = connect(query)
    for scenario in data:
        message = json.dumps(
            {"scenario": scenario[0], "mode": scenario[2]})
        logging.info("publishing {}".format(scenario[1]))
        publish(message)
        if scenario[2] == 'auto':
            message = json.dumps(
                {"scenario": scenario[0], "mode": "portfolio"})
            publish(message)

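# The `publish` helper used above is defined elsewhere. As a rough sketch of
# what it could look like with google-cloud-pubsub (the project and topic names
# are placeholders, and accepting either a dict or a JSON string is an
# assumption based on how publish is called in this module):
from google.cloud import pubsub_v1


def _publish_sketch(message, project_id="my-project", topic_id="scenario-feed"):
    """Publish a JSON string or dict to a Pub/Sub topic; return True on success."""
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)
    if isinstance(message, dict):
        message = json.dumps(message)
    future = publisher.publish(topic_path, data=message.encode("utf-8"))
    future.result()  # block until the message is accepted by Pub/Sub
    return True
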
def get_bucket_ids(scenario):
    """
    Fetch the bucket UUIDs and keywords for a scenario so that
    predictions can be connected to their buckets.
    """
    query = """
        select ab.uuid, model_label, keywords from apis_bucket ab
        left join apis_scenario scr
        on ab."scenarioID_id" = scr.uuid
        where scr."name" = '{}'
        """.format(scenario)
    results = connect(query)

    bucket_ids = {}
    for result in results:
        bucket_ids[result[1]] = result[0]

    bucket_keywords = {}
    for result in results:
        bucket_keywords[result[1]] = result[2]

    return bucket_ids, bucket_keywords

def extract_entities():
    """
    Extracts entities from the title and body of each article using the
    Google Natural Language API plus custom dictionary-based extraction.
    """
    df = get_articles()
    values = []
    values_custom = []
    dic_ref = fetch_custom_entities()
    logging.info("extracting entities from {} articles".format(len(df)))

    client = language_v1.LanguageServiceClient()

    limit = 500
    starttime = time.time()
    start = timer()
    for i, row in df.iterrows():
        # only process 500 articles per minute
        if not ((i + 1) % limit):
            sleeptime = starttime + 60 - time.time()
            if sleeptime > 0:
                time.sleep(sleeptime)
            starttime = time.time()

        text = "{}. {}".format(row["title"], row["body"])[:999]
        if isEnglish(text):
            entities = analyze_entities(client, row["uuid"],
                                        row["published_date"],
                                        row["scenario_id"], text)
            values += entities

            entities = custom_entity_extraction(row["uuid"],
                                                row["published_date"],
                                                row["scenario_id"],
                                                text, dic_ref)
            values_custom += entities

        if not i % 100:
            logging.info("processed: {}".format(i))

    end = timer()
    logging.info("time elapsed: {}".format(end - start))

    story_entity_df = pd.DataFrame(values, columns=[
        "story_uuid", "text", "label", "salience",
        "published_date", "scenario_id", "wiki", "mentions"
    ])
    story_entity_custom_df = pd.DataFrame(values_custom, columns=[
        "story_uuid", "text", "label", "salience",
        "published_date", "scenario_id", "wiki", "mentions"
    ])

    # remove conflicting samples from Google EE and PP
    story_entity_df = pd.concat(
        [story_entity_custom_df, story_entity_df]).drop_duplicates(
            ['story_uuid', 'text']).reset_index(drop=True)

    # cap entity text at 196 characters
    story_entity_df["text"] = story_entity_df["text"].str.slice(0, 196)

    dump_into_entity(story_entity_df.copy())

    story_entity_df.drop(["published_date", "scenario_id"],
                         axis=1, inplace=True)

    # fetch and add existing entities in apis_entity
    entity_df = get_entities()

    # unique values by using the combination of article uuid and the text
    merged_df = pd.merge(story_entity_df, entity_df, how='left',
                         left_on="text", right_on="legal_name")

    # if it doesn't exist in the apis_entity table and is new, generate a new
    # uuid and add the new entities to apis_storyentityref
    check_label_in_story_ref = set(
        merged_df[merged_df.isna().any(axis=1)]["text"])

    ids_str = "', '".join(check_label_in_story_ref)
    ids_str = "('{}')".format(ids_str)

    query = """
        select "parentID_id", name from entity_alias
        where name in {}
        """.format(ids_str)

    # fetch uuids of existing items and new items and add them to merged_df;
    # if an entity exists in apis_storyentityref, just add a reference in the map table
    results = connect(query, verbose=False)
    logging.info("{}/{} existing entity_alias found".format(
        len(results), len(check_label_in_story_ref)))

    story_entity_ref_df = pd.DataFrame(
        results, columns=["entity_ref_id", "entity_name"])

    # drop duplicates
    story_entity_ref_df = story_entity_ref_df.drop_duplicates(
        subset='entity_name', keep="first")

    merged_df = pd.merge(merged_df, story_entity_ref_df, how='left',
                         left_on="text", right_on="entity_name")

    merged_df["wiki"].fillna("", inplace=True)

    merged_df.to_csv("merged_df.csv", index=False)
    df.to_csv("df.csv", index=False)
    story_entity_df.to_csv("story_entity_df.csv", index=False)

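# `analyze_entities` and `custom_entity_extraction` are implemented elsewhere in
# the repo. As a rough illustration of the row format assumed above
# (story_uuid, text, label, salience, published_date, scenario_id, wiki,
# mentions), a google-cloud-language (v2.x) based sketch could look like this:
from google.cloud import language_v1


def _analyze_entities_sketch(client, story_uuid, published_date, scenario_id, text):
    """Return one row per entity found by the Google Natural Language API."""
    document = language_v1.Document(
        content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = client.analyze_entities(
        request={"document": document,
                 "encoding_type": language_v1.EncodingType.UTF8})
    rows = []
    for entity in response.entities:
        rows.append((
            story_uuid,
            entity.name,
            language_v1.Entity.Type(entity.type_).name,  # e.g. "ORGANIZATION"
            entity.salience,
            published_date,
            scenario_id,
            entity.metadata.get("wikipedia_url", ""),
            len(entity.mentions),
        ))
    return rows
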
def publish_to_source(**kwargs):
    """
    Publishes companies to scrape to the pubsub so the news aggregator
    can process the data.
    """
    SOURCE_UUID = kwargs["source_uuid"]
    SOURCE = kwargs["source"]

    # load companies which were added to be tracked
    query = """
        select entity.uuid, entity.name, keywords, "scenarioID_id",
        scenario."trackingDays" from public.apis_entity entity
        left join public.apis_scenario scenario
        on entity."scenarioID_id" = scenario.uuid
        where "entryVerified"=true;
        """
    results = connect(query, verbose=False)
    df = pd.DataFrame(results, columns=[
        "entity_id", "name", "keywords", "scenario_id", "trackingDays"
    ])

    df = df.apply(lambda x: get_last_tracked(x, SOURCE), axis=1)

    items_to_insert = []
    for _, row in df.iterrows():
        params = {}
        params["entity_id"] = row["entity_id"]
        params["entity_name"] = row["name"]
        params["scenario_id"] = row["scenario_id"]
        params["source"] = [SOURCE]
        params["storage_bucket"] = BUCKET_NAME
        params["history_processed"] = row["history_processed"]

        success = False
        if not row["history_processed"]:
            # date from
            date_from = datetime.now() - timedelta(days=int(row["trackingDays"]))
            # date to
            date_to = datetime.now()

            while True:
                date_runner = date_from + timedelta(days=30)
                params["date_from"] = datetime.strftime(date_from, DATE_FORMAT)
                params["date_to"] = datetime.strftime(date_runner, DATE_FORMAT)
                date_from = date_runner
                if date_runner >= date_to:
                    date_to = date_runner - timedelta(days=30)
                    break
                for i in range(0, len(row['keywords']), 2):
                    params["common_names"] = row["keywords"][i:i + 2]
                    success = publish(params)
        else:
            # date from
            date_from = row["last_tracked"]
            # date to
            date_to = datetime.now()

            date_to_write = datetime.strftime(date_to, DATE_FORMAT)
            date_from_write = datetime.strftime(date_from, DATE_FORMAT)

            params["date_from"] = date_from_write
            params["date_to"] = date_to_write

            for i in range(0, len(row['keywords']), 5):
                params["common_names"] = row["keywords"][i:i + 5]
                success = publish(params)

        logging.info("{} - {}".format(row["name"], date_to))

        # if we succeeded in publishing, update the company's status & date
        if success:
            items_to_insert.append((
                str(uuid.uuid4()),
                str(date_to),
                params["entity_id"],
                SOURCE_UUID,
            ))

    insert_values(INSERT_QUERY, items_to_insert)

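# The 30-day chunking inside publish_to_source above is easier to follow as a
# standalone generator. This is an illustrative sketch only, not a helper that
# exists in this repo, and it is not a drop-in replacement for the loop above.
def _iter_date_windows(date_from, date_to, days=30):
    """Yield consecutive (window_start, window_end) pairs of `days` length.

    Windows cover [date_from, date_to); the final window may extend past
    date_to, which is the point at which the loop above stops publishing.
    """
    while date_from < date_to:
        window_end = date_from + timedelta(days=days)
        yield date_from, window_end
        date_from = window_end
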