Example #1
def get_types_ids(labels):
    """
    Check if there are new types of entities, if there is
    add it to the entity type table and then return the
    UUID for all the entities
    """
    ids_str = "', '".join(labels)
    ids_str = "('{}')".format(ids_str)

    query = """
            select uuid, name from apis_entitytype
            where name in {}
            """.format(ids_str)

    results = connect(query, verbose=False)
    existing_types = [item[1] for item in results]

    new_types = set(labels) - set(existing_types)

    if new_types:
        logging.info("inserting: {}".format(", ".join(new_types)))
        insert_query = """
                    INSERT INTO public.apis_entitytype
                    (uuid, "name") VALUES(%s, %s);"""

        values = []
        for etype in new_types:
            values.append((str(uuid.uuid4()), etype))

        insert_values(insert_query, values)

    results = connect(query, verbose=False)
    types = {item[1]: item[0] for item in results}

    return types
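A minimal usage sketch (assumes the connect/insert_values helpers are configured against the same database); the labels are illustrative.

labels = ["PERSON", "ORG", "GPE"]
type_ids = get_types_ids(labels)  # e.g. {"PERSON": "<uuid>", "ORG": "<uuid>", ...}
person_type_uuid = type_ids["PERSON"]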
Example #2
def match_manual_entity_to_story(df):
    """
    Create a manual check of whether our pre-determined
    entities are in the story and create a link.
    """
    query = """
            select entity.uuid, entity.name
            from apis_entity ae
            inner join apis_storyentityref entity
            on ae.uuid = entity.uuid
            """
    results = connect(query, verbose=False)
    df["text"] = df["title"] + df["body"]
    df["text"] = df["text"].str.lower()

    story_map_inputs = []

    for _, row in df.iterrows():
        for entity in results:
            if entity[1].lower() in str(row["text"]):
                data = (str(uuid.uuid4()), entity[0], row["uuid"], 1, .5,
                        str(datetime.utcnow()))
                story_map_inputs.append(data)

    logging.info("{} manual relations found".format(len(story_map_inputs)))
    insert_story_entity_map(story_map_inputs)
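A minimal usage sketch: the function only needs a DataFrame with "uuid", "title" and "body" columns; the story UUID, title, and body below are illustrative placeholders.

import pandas as pd

df = pd.DataFrame([{
    "uuid": "00000000-0000-0000-0000-000000000000",  # placeholder story UUID
    "title": "Acme Corp announces acquisition",
    "body": "Acme Corp said on Monday that it will ...",
}])
match_manual_entity_to_story(df)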
Example #3
def get_last_tracked(row, source):

    # query used to find out whether we've already tracked this particular
    # entity; if we have, it returns the last tracked date for the given
    # scrape source

    query = """
            select max(lastScraped) as lastScrape from
            (select ls."lastScraped" as lastScraped, ss.name, ls."entityID_id" from
            (public.apis_lastscrape as ls left join
            public.apis_scrapesource as ss
            on ls."scrapeSourceID_id"=ss.uuid)
            where name = '{}' and
            ls."entityID_id" = '{}') fp
            """.format(source, row["entity_id"])

    results = connect(query, verbose=False)

    row["last_tracked"] = results[0][0]
    if not results[0][0]:
        row["history_processed"] = False
    else:
        row["history_processed"] = True

    return row
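A minimal usage sketch, mirroring how publish_to_source applies this function row-wise further down; the source name is an illustrative placeholder and df is assumed to have an "entity_id" column.

df = df.apply(lambda row: get_last_tracked(row, "google-news"), axis=1)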
Example #4
def index_source():
    """
    index new domains into our source table
    """

    query = """
    select as2.domain from
    (select distinct "domain" from apis_story) as2
    left join
    apis_source src
    on as2."domain" = src."name"
    where src."name" is null
    """

    new_sources = connect(query)
    values = []
    logging.info("{} new domains".format(len(new_sources)))
    for source in new_sources:
        values.append((str(uuid.uuid4()), source[0], 1.0))

    insert_query = """
                    INSERT INTO public.apis_source
                    (uuid, "name", score)
                    VALUES(%s, %s, %s);
                   """

    insert_values(insert_query, values)
Example #5
def get_scenario_articles(model_uuid,
                          scenario,
                          body=False,
                          article_count=5000):
    """
    Fetch articles which we haven't scored
    using our current model yet which belongs
    to our risk scenario

    Articles filtered based on the following criteria:
    * No score in bucketscores
    * Source exists
    * No relations in entitymap, hence entity extraction not done
    * EntityID in entityref
    """

    if body:
        query = """
                select story.uuid, title, body, published_date, src.uuid as sourceUUID,
                "entityID_id" as entityUUID from public.apis_story story
                left join
                (SELECT distinct "storyID_id" FROM public.apis_bucketscore
                where "modelID_id" = '{}') ab on story.uuid = ab."storyID_id"
                inner join (select "storyID_id", (array_agg(body))[1] as body
                from apis_storybody group by "storyID_id") story_body
                on story_body."storyID_id" = story.uuid
                inner join
                (select distinct "storyID_id" from apis_storyentitymap) entitymap
                on story.uuid = entitymap."storyID_id"
                inner join
                public.apis_source as src on src."name" = story."domain"
                left join
                public.apis_scenario as scnr on scnr.uuid = story."scenarioID_id"
                where scnr."name" = '{}'
                and ab."storyID_id" is null
                and "entityID_id" in (select uuid from apis_storyentityref story)
                limit {}
                """.format(model_uuid, scenario, article_count)
    else:
        query = """
                select story.uuid, title, published_date, src.uuid as sourceUUID,
                "entityID_id" as entityUUID from public.apis_story story
                left join
                (SELECT distinct "storyID_id" FROM public.apis_bucketscore
                where "modelID_id" = '{}') ab on story.uuid = ab."storyID_id"
                inner join
                (select distinct "storyID_id" from apis_storyentitymap) entitymap
                on story.uuid = entitymap."storyID_id"
                inner join
                public.apis_source as src on src."name" = story."domain"
                left join
                public.apis_scenario as scnr on scnr.uuid = story."scenarioID_id"
                where scnr."name" = '{}'
                and ab."storyID_id" is null
                and "entityID_id" in (select uuid from apis_storyentityref story)
                limit {}
                """.format(model_uuid, scenario, article_count)

    articles = connect(query)
    return articles
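A sketch of loading the result into a DataFrame whose columns mirror the SELECT list of the body=True branch; the model UUID and scenario name are illustrative placeholders.

import pandas as pd

model_uuid = "00000000-0000-0000-0000-000000000000"  # placeholder model UUID
articles = get_scenario_articles(model_uuid, "credit_risk", body=True)

# column names follow the SELECT list above
articles_df = pd.DataFrame(
    articles,
    columns=["uuid", "title", "body", "published_date",
             "sourceUUID", "entityUUID"])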
Example #6
def fuzzy_matching():
    """
    Score
    * -1: Matched using Wikipedia
    * -2: Should be parent - create parent and corresponding Alias
    * 0 - 100: Match found using Fuzzy
    """
    results = connect("""select alias.name, wikipedia, "parentID_id",
                         enttype."name" from entity_alias alias
                         inner join apis_entitytype enttype on
                         alias."typeID_id" = enttype.uuid""")

    alias = pd.DataFrame(results,
                         columns=['name', 'wikipedia', 'parent', 'type'])

    logging.info("Starting Fuzzy matching.")

    merged_df = pd.read_csv("merged_df.csv")

    merged_df['score'] = np.nan

    merged_df = process_fuzzy(merged_df, alias)

    logging.info("Found {} parents.".format(
        merged_df[merged_df["score"] == -2].shape[0]))

    logging.info("Matched {} with wikilinks.".format(
        merged_df[merged_df["score"] == -1].shape[0]))

    logging.info("Matched {} with fuzzymatching.".format(
        merged_df[merged_df["score"] > -1].shape[0]))

    merged_df.to_csv("merged_fuzzy_df.csv", index=False)
Example #7
def get_articles():
    """
    Fetch all stories where body exists and we haven't done
    Entity Recognition from active Scenarios
    """
    query = """
                SELECT story.uuid, story.title, body.body, story."scenarioID_id",
                published_date FROM public.apis_story story
                LEFT JOIN
                (SELECT distinct "storyID_id" FROM public.apis_storyentitymap) entity
                ON story.uuid = entity."storyID_id"
                INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
                from apis_storybody where status_code=200 group by "storyID_id") AS body
                ON story.uuid = body."storyID_id"
                WHERE entity."storyID_id" IS null
                AND "language" in ('english', 'US', 'CA', 'AU', 'IE')
                AND "scenarioID_id" in (SELECT uuid FROM apis_scenario as2 WHERE status = 'active')
                LIMIT 1000
            """

    response = connect(query)

    df = pd.DataFrame(
        response,
        columns=["uuid", "title", "body", "scenario_id", "published_date"])
    return df
Example #8
def insert_entity_scores(df, bucket_ids, model_uuid):
    """
    Insert entity scores into the db by fetching the uuid
    of all the entities present in a particular article
    """

    article_uuids = df["uuid"].unique()

    ids_str = "', '".join(article_uuids)
    ids_str = "('{}')".format(ids_str)

    query = """
    select "entityID_id", "storyID_id" from apis_storyentitymap as2
    where as2."storyID_id" in {}
    """.format(ids_str)

    results = connect(query, verbose=False)

    entity_df = pd.DataFrame(results, columns=["entity_id", "story_id"])

    logging.info("{} articles found".format(entity_df["story_id"].nunique()))
    logging.info("{} entities found".format(entity_df["entity_id"].nunique()))

    # get all unique entities in the articles
    entity_df = entity_df.groupby(["story_id"
                                   ])["entity_id"].apply(set).reset_index()

    df = df.merge(entity_df, how="left", left_on="uuid", right_on="story_id")
    df.drop("story_id", axis=1, inplace=True)

    df["entity_id"] = df.apply(add_article_entity_to_score, axis=1)

    values = []
    """
    for each row
    for each entities
    and for each bucket
    insert the scores
    """
    for _, row in df.iterrows():
        for entity in row["entity_id"]:
            for bucket in bucket_ids.keys():
                log_row = (str(uuid.uuid4()), row["uuid"], row[bucket],
                           bucket_ids[bucket], entity, model_uuid,
                           row["sourceUUID"], str(datetime.now()))
                values.append(log_row)

    logging.info("writing {} articles into entity scores".format(df.shape[0]))
    insert_query = """
        INSERT INTO public.apis_entityscore
        (uuid, "storyID_id", "grossScore", "bucketID_id",
        "entityID_id", "modelID_id", "sourceID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """

    insert_values(insert_query, values)
Example #9
def insert_entity_into_entityref():
    """
    Fetch all entities that are not in entity_ref and input it into entity_ref.

    * If entity exists in api_entity table but not in api_story_entity_ref table
      with same UUID, add it to apis_storyentityref save all entity uuid,
      new and old to merged_df
    * Check if the alias exists for the entity, otherwise input the values
      into alias too
    """
    entity_ref_query = """
                       select entity.uuid, entity.name as legal_name,
                       entity."typeID_id", "wikipedia", true, created_at
                       from apis_entity entity where uuid not in
                       (select ae.uuid from apis_entity ae
                       inner join apis_storyentityref ar
                       on ae.uuid = ar.uuid)
                       and entity."entryVerified"=true
                       """

    entity_ref_results = connect(entity_ref_query, verbose=False)
    logging.info("{} entities to insert to entityref".format(
        len(entity_ref_results)))
    insert_story_entity_ref(entity_ref_results)

    alias_query = """
                  select uuid_generate_v4(), entity.name, "wikipedia", -3,
                  now()::text, entity.uuid, entity."typeID_id"
                  from apis_entity entity where uuid not in
                  (select ae.uuid from apis_entity ae
                  inner join entity_alias ar
                  on ae.uuid = ar."parentID_id")
                  and entity."entryVerified"=true
                  """

    alias_results = connect(alias_query, verbose=False)
    logging.info("{} entities to insert to entity_alias".format(
        len(alias_results)))

    insert_entity_alias(alias_results)
Example #10
def get_entities():

    # Fetch all manually entered, verified entities that we have in our storage
    query = """
        select entity.uuid, entity.name as legal_name from
        public.apis_entity entity
        where "manualEntry"=true
        and "entryVerified"=true;
        """

    results = connect(query, verbose=False)

    insert_entity_into_entityref()

    entity_df = pd.DataFrame(results, columns=["entity_id", "legal_name"])
    return entity_df
Example #11
def get_model_details(scenario):
    """
    get the bucket and model details
    """

    model_query = """
        select am.uuid, bucket, storage_link, am."name" from apis_modeldetail am
        left join
        apis_scenario scr on am."scenarioID_id" = scr.uuid
        where scr."name" = '{}' and
        "version" = (select max("version") from apis_modeldetail am
        left join
        apis_scenario scr on am."scenarioID_id" = scr.uuid
        where scr."name" = '{}')
        """.format(scenario, scenario)

    results = connect(model_query)
    logging.info(results)
    return results
Example #12
def sentiment_from_body():
    """
    runs sentiment analysis on each article
    """

    # fetch all stories where body exists and we haven't done
    # sentiment analysis
    query = """
                SELECT story.uuid, body.body FROM
                public.apis_story story
                LEFT JOIN
                (SELECT distinct "storyID_id" FROM public.apis_storysentiment
                WHERE is_headline=false) sentiment
                ON story.uuid = sentiment."storyID_id"
                INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
                from apis_storybody group by "storyID_id") AS body
                ON story.uuid = body."storyID_id"
                WHERE sentiment."storyID_id" IS NULL
                LIMIT 20000
            """

    response = connect(query)
    values = []
    logging.info("extracting entities from {} articles".format(len(response)))

    count = 1
    for story_uuid, body in response:
        sentiment = get_sentiment(body)
        values.append(
            (str(uuid.uuid4()), False, json.dumps(sentiment), story_uuid,
             str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
                      INSERT INTO public.apis_storysentiment
                      (uuid, is_headline, sentiment, "storyID_id", "entryTime")
                      VALUES(%s, %s, %s, %s, %s);
                   """

    insert_values(insert_query, values)
    logging.info("finished")
Example #13
def sentiment_on_headlines():
    """
    runs sentiment analysis on each article
    """

    # fetch the titles of all stories we haven't run
    # sentiment analysis on yet
    query = """
               SELECT story.uuid, title FROM
               public.apis_story story
               LEFT JOIN
               (SELECT distinct "storyID_id" FROM public.apis_storysentiment
               WHERE is_headline=true) sentiment
               ON story.uuid = sentiment."storyID_id"
               WHERE sentiment."storyID_id" IS NULL
               LIMIT 20000
            """

    response = connect(query)
    values = []
    logging.info("extracting entities from {} articles".format(len(response)))

    count = 1
    for story_uuid, headline in response:
        sentiment = get_sentiment(headline)
        values.append(
            (str(uuid.uuid4()), True, json.dumps(sentiment), story_uuid,
             str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
                      INSERT INTO public.apis_storysentiment
                      (uuid, is_headline, sentiment, "storyID_id", "entryTime")
                      VALUES(%s, %s, %s, %s, %s);
                   """

    insert_values(insert_query, values)
    logging.info("finished")
Example #14
def fetch_scenarios():
    """
    Publish message to generate feed
    """

    query = """
            select uuid, name, mode from apis_scenario
            where status = 'active'
            """
    data = connect(query)

    for scenario in data:
        message = json.dumps(
            {"scenario": scenario[0], "mode": scenario[2]})

        logging.info("publishing {}".format(scenario[1]))

        publish(message)
        if scenario[2] == 'auto':
            message = json.dumps(
                {"scenario": scenario[0], "mode": "portfolio"})
            publish(message)
Example #15
def get_bucket_ids(scenario):
    """
    Connect predictions to bucket
    fetch UUIDs and connect to prediction
    """

    query = """
    select ab.uuid, model_label, keywords from apis_bucket ab
    left join apis_scenario scr on ab."scenarioID_id" = scr.uuid
    where scr."name" = '{}'
    """.format(scenario)

    results = connect(query)

    bucket_ids = {}
    bucket_keywords = {}
    for result in results:
        bucket_ids[result[1]] = result[0]
        bucket_keywords[result[1]] = result[2]

    return bucket_ids, bucket_keywords
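A minimal usage sketch; the scenario name is an illustrative placeholder.

bucket_ids, bucket_keywords = get_bucket_ids("credit_risk")
# bucket_ids:      {model_label: bucket_uuid, ...}
# bucket_keywords: {model_label: keywords, ...}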
Example #16
def extract_entities():
    """
    extracts entities using spaCy from the body
    of the article
    """

    df = get_articles()

    values = []

    values_custom = []

    dic_ref = fetch_custom_entities()

    logging.info("extracting entities from {} articles".format(len(df)))

    client = language_v1.LanguageServiceClient()

    limit = 500
    starttime = time.time()

    start = timer()
    for i, row in df.iterrows():
        # only process 500 articles per minute
        if not ((i + 1) % limit):
            sleeptime = starttime + 60 - time.time()
            if sleeptime > 0:
                time.sleep(sleeptime)
            starttime = time.time()

        text = "{}. {}".format(row["title"], row["body"])[:999]
        if isEnglish(text):
            entities = analyze_entities(client, row["uuid"],
                                        row["published_date"],
                                        row["scenario_id"], text)
            values += entities

            entities = custom_entity_extraction(row["uuid"],
                                                row["published_date"],
                                                row["scenario_id"], text,
                                                dic_ref)
            values_custom += entities

        if not i % 100:
            logging.info("processed: {}".format(i))

    end = timer()
    logging.info("time elapsed: {}".format(end - start))

    story_entity_df = pd.DataFrame(values,
                                   columns=[
                                       "story_uuid", "text", "label",
                                       "salience", "published_date",
                                       "scenario_id", "wiki", "mentions"
                                   ])

    story_entity_custom_df = pd.DataFrame(values_custom,
                                          columns=[
                                              "story_uuid", "text", "label",
                                              "salience", "published_date",
                                              "scenario_id", "wiki", "mentions"
                                          ])

    # remove conflicting samples from Google EE and PP
    story_entity_df = story_entity_custom_df.append(
        story_entity_df).drop_duplicates(['story_uuid',
                                          'text']).reset_index(drop=True)

    # cap entity text at 196 characters
    story_entity_df["text"] = story_entity_df["text"].str.slice(0, 196)

    dump_into_entity(story_entity_df.copy())
    story_entity_df.drop(["published_date", "scenario_id"],
                         axis=1,
                         inplace=True)

    # fetch existing entities from apis_entity and merge them in
    entity_df = get_entities()

    # unique values by using combination of article uuid and the text
    merged_df = pd.merge(story_entity_df,
                         entity_df,
                         how='left',
                         left_on="text",
                         right_on="legal_name")

    # if an entity doesn't exist in the apis_entity table and is new,
    # generate a new uuid and add it to apis_storyentityref
    check_label_in_story_ref = set(
        merged_df[merged_df.isna().any(axis=1)]["text"])

    ids_str = "', '".join(check_label_in_story_ref)
    ids_str = "('{}')".format(ids_str)

    query = """
            select "parentID_id", name from entity_alias
            where name in {}
            """.format(ids_str)

    # fetch uuid of existing items and new items and add to merged_df
    # if exists in apis_story_ref, just add ref in map table
    results = connect(query, verbose=False)

    logging.info("{}/{} existing entity_alias found".format(
        len(results), len(check_label_in_story_ref)))

    story_entity_ref_df = pd.DataFrame(
        results, columns=["entity_ref_id", "entity_name"])

    # drop duplicates
    story_entity_ref_df = story_entity_ref_df.drop_duplicates(
        subset='entity_name', keep="first")

    merged_df = pd.merge(merged_df,
                         story_entity_ref_df,
                         how='left',
                         left_on="text",
                         right_on="entity_name")

    merged_df["wiki"].fillna("", inplace=True)

    merged_df.to_csv("merged_df.csv", index=False)
    df.to_csv("df.csv", index=False)
    story_entity_df.to_csv("story_entity_df.csv", index=False)
Example #17
def publish_to_source(**kwargs):
    """
    publishes companies to scrape to the pubsub
    so news aggregator may process the data
    """
    SOURCE_UUID = kwargs["source_uuid"]
    SOURCE = kwargs["source"]

    # load companies which were added to be tracked
    query = """
                select entity.uuid, entity.name, keywords,
                "scenarioID_id", scenario."trackingDays" from
                public.apis_entity entity
                left join
                public.apis_scenario scenario
                on entity."scenarioID_id" = scenario.uuid
                where "entryVerified"=true;
            """

    results = connect(query, verbose=False)

    df = pd.DataFrame(results,
                      columns=[
                          "entity_id", "name", "keywords", "scenario_id",
                          "trackingDays"
                      ])

    df = df.apply(lambda x: get_last_tracked(x, SOURCE), axis=1)

    items_to_insert = []
    for _, row in df.iterrows():
        params = {}
        params["entity_id"] = row["entity_id"]
        params["entity_name"] = row["name"]
        params["scenario_id"] = row["scenario_id"]
        params["source"] = [SOURCE]
        params["storage_bucket"] = BUCKET_NAME
        params["history_processed"] = row["history_processed"]

        if not row["history_processed"]:
            # date from
            date_from = datetime.now() - \
                timedelta(days=int(row["trackingDays"]))

            # date to
            date_to = datetime.now()

            while True:
                date_runner = date_from + timedelta(days=30)

                params["date_from"] = datetime.strftime(date_from, DATE_FORMAT)
                params["date_to"] = datetime.strftime(date_runner, DATE_FORMAT)

                date_from = date_runner

                if date_runner >= date_to:
                    date_to = date_runner - timedelta(days=30)
                    break

                for i in range(0, len(row['keywords']), 2):
                    params["common_names"] = row["keywords"][i:i + 2]
                    success = publish(params)
        else:
            # date from
            date_from = row["last_tracked"]

            # date to
            date_to = datetime.now()
            date_to_write = datetime.strftime(date_to, DATE_FORMAT)
            date_from_write = datetime.strftime(date_from, DATE_FORMAT)
            params["date_from"] = date_from_write
            params["date_to"] = date_to_write

            for i in range(0, len(row['keywords']), 5):
                params["common_names"] = row["keywords"][i:i + 5]
                success = publish(params)

        logging.info("{} - {}".format(row["name"], date_to))
        # if publishing succeeded, update the company status & date
        if success:
            items_to_insert.append((
                str(uuid.uuid4()),
                str(date_to),
                params["entity_id"],
                SOURCE_UUID,
            ))

    insert_values(INSERT_QUERY, items_to_insert)