Example #1
def index_source():
    """
    index new domains into our source table
    """

    query = """
    select as2.domain from
    (select distinct "domain" from apis_story) as2
    left join
    apis_source src
    on as2."domain" = src."name"
    where src."name" is null
    """

    new_sources = connect(query)
    values = []
    logging.info("{} new domains".format(len(new_sources)))
    for source in new_sources:
        values.append((str(uuid.uuid4()), source[0], 1.0))

    insert_query = """
                    INSERT INTO public.apis_source
                    (uuid, "name", score)
                    VALUES(%s, %s, %s);
                   """

    insert_values(insert_query, values)
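All of these examples lean on two helpers, connect and insert_values, that are defined elsewhere in the module. A minimal sketch of what they might look like on top of psycopg2 (the DSN and helper bodies are assumptions, not the original code):

import logging
import psycopg2
import psycopg2.extras

# Placeholder DSN; the real connection settings are not shown in the original.
DSN = "dbname=apis user=apis host=localhost"


def connect(query, verbose=True):
    """Run a SELECT and return all rows (assumed behavior)."""
    if verbose:
        logging.info(query)
    with psycopg2.connect(DSN) as conn:
        with conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchall()


def insert_values(query, values):
    """Batch-insert a list of parameter tuples (assumed behavior)."""
    with psycopg2.connect(DSN) as conn:
        with conn.cursor() as cur:
            psycopg2.extras.execute_batch(cur, query, values)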
Example #2
def get_types_ids(labels):
    """
    Check whether any of the given entity types are new; if so,
    add them to the entity type table. Then return a mapping
    from type name to UUID for all the given labels.
    """
    ids_str = "', '".join(labels)
    ids_str = "('{}')".format(ids_str)

    query = """
            select uuid, name from apis_entitytype
            where name in {}
            """.format(ids_str)

    results = connect(query, verbose=False)
    existing_types = [item[1] for item in results]

    new_types = set(labels) - set(existing_types)

    if new_types:
        logging.info("inserting: {}".format(", ".join(new_types)))
        insert_query = """
                    INSERT INTO public.apis_entitytype
                    (uuid, "name") VALUES(%s, %s);"""

        values = []
        for etype in new_types:
            values.append((str(uuid.uuid4()), etype))

        insert_values(insert_query, values)

    results = connect(query, verbose=False)
    types = {item[1]: item[0] for item in results}

    return types
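Note that get_types_ids splices the labels into the SQL string itself, so a label containing a quote would break the query. If a cursor is available, psycopg2 can adapt a Python tuple for the IN clause directly; a hedged sketch of that safer variant:

# Safer lookup: let the driver adapt a tuple for the IN clause instead of
# interpolating label strings into the SQL ourselves.
def fetch_type_ids(cur, labels):
    cur.execute(
        "select uuid, name from apis_entitytype where name in %s",
        (tuple(labels),),
    )
    return {name: type_uuid for type_uuid, name in cur.fetchall()}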
Example #3
def insert_story_entity_map(values):
    query = """
            INSERT INTO public.apis_storyentitymap
            (uuid, "entityID_id", "storyID_id", mentions, salience, created_at)
            VALUES(%s, %s, %s, %s, %s, %s);
            """

    insert_values(query, values)
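Callers pass tuples in the same order as the column list. A hypothetical call, with every value made up for illustration:

from datetime import datetime
import uuid

# Illustrative values only; the entity and story UUIDs would come from
# earlier lookups against apis_storyentityref and apis_story.
insert_story_entity_map([(
    str(uuid.uuid4()),     # uuid
    "entity-uuid-here",    # entityID_id
    "story-uuid-here",     # storyID_id
    4,                     # mentions
    0.83,                  # salience
    str(datetime.now()),   # created_at
)])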
Example #4
def insert_story_entity_ref(values):
    query = """
            INSERT INTO public.apis_storyentityref
            (uuid, "name", "typeID_id", wikipedia, render, created_at)
            VALUES(%s, %s, %s, %s, %s, %s);
            """

    insert_values(query, values)
Example #5
def insert_entity_alias(values):
    query = """
            INSERT INTO public.entity_alias
            (uuid, "name", wikipedia, score, created_at, "parentID_id", "typeID_id")
            VALUES(%s, %s, %s, %s, %s, %s, %s);
            """

    insert_values(query, values)
Example #6
def insert_entity_scores(df, bucket_ids, model_uuid):
    """
    Insert entity scores into the db by fetching the uuid
    of all the entities present in a particular article
    """

    article_uuids = df["uuid"].unique()

    ids_str = "', '".join(article_uuids)
    ids_str = "('{}')".format(ids_str)

    query = """
    select "entityID_id", "storyID_id" from apis_storyentitymap as2
    where as2."storyID_id" in {}
    """.format(ids_str)

    results = connect(query, verbose=False)

    entity_df = pd.DataFrame(results, columns=["entity_id", "story_id"])

    logging.info("{} articles found".format(entity_df["story_id"].nunique()))
    logging.info("{} entities found".format(entity_df["entity_id"].nunique()))

    # get all unique entities in the articles
    entity_df = (entity_df.groupby("story_id")["entity_id"]
                 .apply(set).reset_index())

    df = df.merge(entity_df, how="left", left_on="uuid", right_on="story_id")
    df.drop("story_id", axis=1, inplace=True)

    df["entity_id"] = df.apply(add_article_entity_to_score, axis=1)

    values = []
    """
    for each row
    for each entities
    and for each bucket
    insert the scores
    """
    for _, row in df.iterrows():
        for entity in row["entity_id"]:
            for bucket in bucket_ids.keys():
                log_row = (str(uuid.uuid4()), row["uuid"], row[bucket],
                           bucket_ids[bucket], entity, model_uuid,
                           row["sourceUUID"], str(datetime.now()))
                values.append(log_row)

    logging.info("writing {} articles into entity scores".format(df.shape[0]))
    insert_query = """
        INSERT INTO public.apis_entityscore
        (uuid, "storyID_id", "grossScore", "bucketID_id",
        "entityID_id", "modelID_id", "sourceID_id", "entryTime")
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """

    insert_values(insert_query, values)
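insert_entity_scores assumes df carries one column per bucket name holding the gross score, plus uuid and sourceUUID columns, and that bucket_ids maps bucket names to their table UUIDs. A toy invocation (all identifiers invented; it also needs a live database behind connect/insert_values and the add_article_entity_to_score helper, which is not shown):

import pandas as pd

# Invented identifiers for illustration; real ones come from the database.
bucket_ids = {"risk": "bucket-uuid-1", "innovation": "bucket-uuid-2"}
df = pd.DataFrame([{
    "uuid": "story-uuid-1",
    "sourceUUID": "source-uuid-1",
    "risk": 0.7,
    "innovation": 0.2,
}])
insert_entity_scores(df, bucket_ids, model_uuid="model-uuid-1")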
Example #7
def insert_clustermap(CLUSTER_MAP):
    """
    Insert cluster mappings; on a uuid conflict, update the
    existing row's cluster assignment instead.
    """

    logging.info('Inserting {} cluster mappings'.format(len(CLUSTER_MAP)))

    query = """
            INSERT INTO ml_clustermap (uuid,"clusterID_id","storyID_id",
            published_date) VALUES (%s,%s,%s,%s) ON CONFLICT(uuid)
            DO UPDATE SET "clusterID_id" = EXCLUDED."clusterID_id"
            """

    insert_values(query, CLUSTER_MAP)
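The ON CONFLICT clause makes this an upsert keyed on uuid: re-inserting a row with an existing uuid simply moves that story to the new cluster. The expected tuple shape, with illustrative values:

from datetime import datetime
import uuid

# One tuple per story-to-cluster assignment; values are illustrative.
insert_clustermap([(
    str(uuid.uuid4()),     # uuid, the conflict key
    "cluster-uuid-1",      # clusterID_id
    "story-uuid-1",        # storyID_id
    str(datetime.now()),   # published_date
)])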
Example #8
def insert_cluster(new_clusters):
    """
    Insert newly found clusters, numbering them from the
    current maximum cluster number.
    """

    logging.info('Inserting {} new clusters'.format(len(new_clusters)))

    cluster_num = get_max_cluster_num()
    cluster_ids = []

    for cluster_uuid in np.unique(new_clusters):
        cluster_num += 1
        cluster_ids.append((cluster_uuid, str(cluster_num)))

    query = """
            INSERT INTO ml_cluster (uuid,cluster) VALUES(%s,%s)
            """

    insert_values(query, cluster_ids)
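get_max_cluster_num is not shown. Since the cluster number is stored as text by the insert above, a plausible implementation (an assumption, not the original) would cast it back to an integer:

def get_max_cluster_num():
    # Assumes ml_cluster.cluster holds the number as text, as inserted above;
    # fall back to 0 on an empty table.
    result = connect("select max(cluster::int) from ml_cluster")
    return result[0][0] or 0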
Example #9
def sentiment_from_body():
    """
    runs sentiment analysis on the body of each article
    """

    # fetch all stories that have a body and on which we haven't
    # yet run body sentiment analysis
    query = """
                SELECT story.uuid, body.body FROM
                public.apis_story story
                LEFT JOIN
                (SELECT distinct "storyID_id" FROM public.apis_storysentiment
                WHERE is_headline=false) sentiment
                ON story.uuid = sentiment."storyID_id"
                INNER JOIN (select "storyID_id", (array_agg(body))[1] as body
                from apis_storybody group by "storyID_id") AS body
                ON story.uuid = body."storyID_id"
                WHERE sentiment."storyID_id" IS NULL
                LIMIT 20000
            """

    response = connect(query)
    values = []
    logging.info("extracting entities from {} articles".format(len(response)))

    count = 1
    for story_uuid, body in response:
        sentiment = get_sentiment(body)
        values.append(
            (str(uuid.uuid4()), False, json.dumps(sentiment), story_uuid,
             str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
                      INSERT INTO public.apis_storysentiment
                      (uuid, is_headline, sentiment, "storyID_id", "entryTime")
                      VALUES(%s, %s, %s, %s, %s);
                   """

    insert_values(insert_query, values)
    logging.info("finished")
Example #10
def insert_bucket_scores(df, bucket_ids, model_uuid):
    """
    Insert bucket scores into the db
    """
    values = []
    for _, row in df.iterrows():
        for bucket in bucket_ids.keys():
            log_row = (str(uuid.uuid4()), row["uuid"], str(datetime.now()),
                       row[bucket], bucket_ids[bucket], model_uuid,
                       row["sourceUUID"], row["published_date"])
            values.append(log_row)

    logging.info("writing {} articles into bucket scores".format(df.shape[0]))
    insert_query = """
    INSERT INTO public.apis_bucketscore
    (uuid, "storyID_id", "entryTime", "grossScore",
    "bucketID_id", "modelID_id", "sourceID_id", "storyDate")
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """
    insert_values(insert_query, values)
Example #11
def sentiment_on_headlines():
    """
    runs sentiment analysis on the headline of each article
    """

    # fetch the titles of all stories on which we haven't yet
    # run headline sentiment analysis
    query = """
               SELECT story.uuid, title FROM
               public.apis_story story
               LEFT JOIN
               (SELECT distinct "storyID_id" FROM public.apis_storysentiment
               WHERE is_headline=true) sentiment
               ON story.uuid = sentiment."storyID_id"
               WHERE sentiment."storyID_id" IS NULL
               LIMIT 20000
            """

    response = connect(query)
    values = []
    logging.info("extracting entities from {} articles".format(len(response)))

    count = 1
    for story_uuid, headline in response:
        sentiment = get_sentiment(headline)
        values.append(
            (str(uuid.uuid4()), True, json.dumps(sentiment), story_uuid,
             str(datetime.now())))
        if not count % 100:
            logging.info("processed: {}".format(count))
        count += 1

    insert_query = """
                      INSERT INTO public.apis_storysentiment
                      (uuid, is_headline, sentiment, "storyID_id", "entryTime")
                      VALUES(%s, %s, %s, %s, %s);
                   """

    insert_values(insert_query, values)
    logging.info("finished")
Example #12
def publish_to_source(**kwargs):
    """
    publishes the companies to scrape to Pub/Sub so the
    news aggregator can process them
    """
    SOURCE_UUID = kwargs["source_uuid"]
    SOURCE = kwargs["source"]

    # load companies which were added to be tracked
    query = """
                select entity.uuid, entity.name, keywords,
                "scenarioID_id", scenario."trackingDays" from
                public.apis_entity entity
                left join
                public.apis_scenario scenario
                on entity."scenarioID_id" = scenario.uuid
                where "entryVerified"=true;
            """

    results = connect(query, verbose=False)

    df = pd.DataFrame(results,
                      columns=[
                          "entity_id", "name", "keywords", "scenario_id",
                          "trackingDays"
                      ])

    df = df.apply(lambda x: get_last_tracked(x, SOURCE), axis=1)

    items_to_insert = []
    for _, row in df.iterrows():
        success = False  # stays False if this row publishes nothing
        params = {}
        params["entity_id"] = row["entity_id"]
        params["entity_name"] = row["name"]
        params["scenario_id"] = row["scenario_id"]
        params["source"] = [SOURCE]
        params["storage_bucket"] = BUCKET_NAME
        params["history_processed"] = row["history_processed"]

        if not row["history_processed"]:
            # date from
            date_from = datetime.now() - timedelta(
                days=int(row["trackingDays"]))

            # date to
            date_to = datetime.now()

            while True:
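                # publish history in 30-day slices; a slice that reaches
                # past today is not published, and date_to is rolled back
                # to the end of the last full slice before breaking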
                date_runner = date_from + timedelta(days=30)

                params["date_from"] = datetime.strftime(date_from, DATE_FORMAT)
                params["date_to"] = datetime.strftime(date_runner, DATE_FORMAT)

                date_from = date_runner

                if date_runner >= date_to:
                    date_to = date_runner - timedelta(days=30)
                    break

                for i in range(0, len(row['keywords']), 2):
                    params["common_names"] = row["keywords"][i:i + 2]
                    success = publish(params)
        else:
            # date from
            date_from = row["last_tracked"]

            # date to
            date_to = datetime.now()
            date_to_write = datetime.strftime(date_to, DATE_FORMAT)
            date_from_write = datetime.strftime(date_from, DATE_FORMAT)
            params["date_from"] = date_from_write
            params["date_to"] = date_to_write

            for i in range(0, len(row['keywords']), 5):
                params["common_names"] = row["keywords"][i:i + 5]
                success = publish(params)

        logging.info("{} - {}".format(row["name"], date_to))
        # if succeeded in publishing update company status & date
        if success:
            items_to_insert.append((
                str(uuid.uuid4()),
                str(date_to),
                params["entity_id"],
                SOURCE_UUID,
            ))

    insert_values(INSERT_QUERY, items_to_insert)
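publish and INSERT_QUERY are module-level pieces not shown here. Given the docstring's Pub/Sub wording, a sketch of publish on top of google-cloud-pubsub (the project and topic names are placeholders; this is an assumption, not the original implementation):

import json
import logging

from google.cloud import pubsub_v1

_publisher = pubsub_v1.PublisherClient()
# Placeholder project and topic; the real names are not in the original.
_topic_path = _publisher.topic_path("my-project", "news-scraping")


def publish(params):
    # Serialize the params dict (datetimes become strings), publish it, and
    # report success so the caller can record the tracking date.
    try:
        future = _publisher.publish(
            _topic_path, json.dumps(params, default=str).encode("utf-8"))
        future.result(timeout=30)
        return True
    except Exception:
        logging.exception("publish failed for %s", params.get("entity_name"))
        return False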