Example #1
def run(ts):
    """
        This function does:
            * Creates the Tree Schema(germination)
            * Update Nodes(leaves & accumulated) with item_count(count) & avg_popi in schema iteself
            * Creates & updates th_table for given timestamp(ts)
    """
    """ create the tree """
    startTime = time.time()
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Tree Germination in progress .......    .   .   .   .   .   .   .   .   .\n"
    )
    root = TreeGermination()
    pc.printSucc(
        "\t\t <----------------------------------------------- Tree is Germinated ------------------------------------------------>\n"
    )
    """ update leafnodes """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating Leaf(tag) Nodes.......    .   .   .   .   .   .   .   .   .\n"
    )
    updateLeafNodes(ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- Leaf Nodes updated ------------------------------------------------>\n"
    )
    """ update parents """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating Parent Nodes.......    .   .   .   .   .   .   .   .   .\n"
    )
    updateParentNodes(root)
    pc.printSucc(
        "\t\t <--------------------------------------------- Parent Nodes updated ------------------------------------------------>\n"
    )
    """ NOTE: Print the Tree if you want """
    tree_printer_pretty.print_tree(root)
    """ Create & Populate Tag Hotness(TH) Table"""
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Creating & Populating TH Table .......    .   .   .   .   .   .   .   .   .\n"
    )
    create_th(ts)
    update_th_mptt(root, 1, 1, ts)  # update_th_mptt(root,left,level,ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- TH Table Created & Populated ------------------------------------------------>\n"
    )
    """ Update th_table for ItemIDs of wc_table """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating th_table for ItemIDs from wc_table.......    .   .   .   .   .   .   .   .   .\n"
    )
    update_th_table_for_itemIDs(root, ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- th_table now has ItemIDs(HN_IDs,R_IDs) from wc_table ------------------------------------------------>\n"
    )

    endTime = time.time()
    th_table = 'th_' + str(int(ts))
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR th_creating & th_updating@th (sec)   =>  {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), th_table))
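
The call update_th_mptt(root, 1, 1, ts) above assigns Modified Preorder Tree Traversal (MPTT) left/right indices to every node while filling th_table; the helper itself is not shown in this example. Below is a minimal sketch of the numbering scheme only, assuming each node exposes hypothetical .children and .name attributes, and leaving out the DB writes.

def mptt_number(node, left=1, level=1, out=None):
    # Sketch only: collect (LeftMptt, RightMptt, DepthLevel) per node via a preorder walk.
    if out is None:
        out = {}
    cursor = left + 1
    for child in node.children:              # assumption: every node carries a .children list
        cursor = mptt_number(child, cursor, level + 1, out) + 1
    out[node.name] = (left, cursor, level)   # a leaf ends up with right == left + 1
    return cursor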
Example #2
def update_modelTags(ts):
    """
        runs on the table (wc_<ts>) in wc.db & updates the ModelTags column
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started  TaggerSimulator@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning PopiCalculator for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ."
    )
    startTime = time.time()

    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """
            ============= row is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
        """
        modelTags = []

        #TODO: call actual Api here, when model is ready
        # pc.printMsg("\t <ID = {}><src= {} > [Tagger] Start................ ".format(row[0],row[1]))

        conf_arr = SimulatorApi(row[13], row[12])
        for item in conf_arr:
            tag = item[0]
            conf = item[1]
            if (conf >= tags_threshold[tag]):
                modelTags.append(tag)
                # pc.printWarn(" \t\t\t\t => Added \t {} \t conf = {}".format(tag,conf))
        modelTags = json.dumps(modelTags)
        query = 'update ' + wc_table + ' set ModelTags = ? where ID = ? and SourceSite = ?'
        data = (modelTags, row[0], row[1])
        c.execute(query, data)

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR TaggerSimulator@wc(sec)    =>  {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), wc_table))
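
SimulatorApi and tags_threshold are imported from elsewhere and not shown in this listing. The stub below illustrates, under assumed shapes (a dict of per-tag confidence cut-offs and an API returning (tag, confidence) pairs), how the filter above decides what lands in ModelTags; the tag names and numbers are made up.

import json

tags_threshold = {"prog_query": 0.40, "startup": 0.55}          # assumed shape: tag -> minimum confidence

def SimulatorApi(content, weighted_content):
    # Stand-in for the real model API: returns (tag, confidence) pairs.
    return [("prog_query", 0.62), ("startup", 0.31)]

model_tags = [tag for tag, conf in SimulatorApi("...", "...") if conf >= tags_threshold[tag]]
print(json.dumps(model_tags))                                    # '["prog_query"]' is what gets written to ModelTags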
Example #3
def updateLeafNodes(ts):
    """     
        This is the query:
           select count(ID) from wc_1601292562 where ModelTags like "%prog_query%" or SourceTags like "%prog_query%";

    """

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started  UpdateLeafNodes@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning UpdateLeafNodes for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ."
    )
    startTime = time.time()

    for tag in tags_names:
        q = 'select count(ID) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        d = (
            '%"{}"%'.format(tag),
            '%"{}"%'.format(tag),
        )
        item_count = c.execute(q, d)
        item_count = c.fetchone()[0]
        q = 'select avg(PopI) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        avg_popi = c.execute(q, d)
        avg_popi = c.fetchone()[0]
        if avg_popi is None:
            avg_popi = 0
        else:
            avg_popi = round(avg_popi, 10)
        curr_node = node_dict[tag]
        if curr_node.isTag:  # update only if it's a leaf
            curr_node.count = item_count
            curr_node.popi = avg_popi
            pc.printSucc(
                " \t\t\t..... Updated node: {}  \t => c = {}  , p = {}".format(
                    curr_node.name, item_count, avg_popi))

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR UpdateLeafNodes In Tree  (sec)   =>  {} \n"
        .format(round((endTime - startTime), 5)))
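
A note on the LIKE parameters built above: ModelTags/SourceTags are stored as JSON-encoded lists (see update_modelTags), so the pattern wraps the tag in double quotes to avoid matching one tag name inside another. A small illustration, with made-up tag names:

import json

stored = json.dumps(["prog_query", "startup"])   # what the column actually holds: '["prog_query", "startup"]'
pattern = '%"{}"%'.format("prog")                # -> '%"prog"%'
# LIKE with this pattern matches the exact token '"prog"' but not '"prog_query"',
# which is why updateLeafNodes parameterizes its queries with '%"<tag>"%'.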
def run(ts):
    startTime = time.time()

    try:
        run_wc(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        run_wp(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    
    endTime = time.time()

    pc.printSucc("**************************** PopI Calculation is Done for wc & wp ********************************\n\n")
    pc.printWarn("| \t\t TIME TAKEN FOR PopICalculators-both     \t\t | \t\t {}  \t\t |".format(round((endTime - startTime),5)))
    pc.printSucc("*************************************************************************************************\n\n")

    pc.printSucc("\n\n***************************** PopI Calculation is Complete.************************")
    print("\n\n")
    table = PrettyTable(['Entity (Post PopI Calculation)', 'Value'])
    table.add_row(['TIME TAKEN FOR PopICalculators(wc & wp) (min)', round((endTime - startTime)/60,2)])
    pc.printSucc(table)
    print("\n\n")
Example #5
def RunAsync(ts):
    """
        Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url- ASYNCLY
        * NOTE:
            * If conent is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """

    # pc.printMsg('@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = 10, conn_lim =10]............ => FILENAME: {}\n'.format(datetime.fromtimestamp(ts),'dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = 50, conn_lim =50]............ => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts),
                'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    startTime = time.time()
    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '_wwccc100-8.csv'

    # Run the async job
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(asyncFetchAll(csv_src_file, csv_dest_file)))

    endTime = time.time()
    pc.printSucc(
        "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n \t\t ===========> TIME TAKEN = {}"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv',
                (endTime - startTime)))

    pc.printMsg(
        "\n------------------------------------------------------------------------"
    )
    pc.printMsg("|\tENTRIES_TO_BE_WRITTEN        \t  | \t {} \t|".format(
        ENTRIES_TO_BE_WRITTEN))
    pc.printMsg("|\tWRITTEN_ENTRIES_ASYNC_DIRECT \t  | \t {} \t|".format(
        WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg("|\tWRITTEN_ENTRIES_ASYNC_SCRAPED\t  | \t {} \t|".format(
        WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printMsg("|\tSKIPPED_ASYNC                \t  | \t {} \t|".format(
        SKIPPED_ASYNC))
    pc.printMsg(
        "|\tFAILED_ASYNC                 \t  | \t {} \t|".format(FAILED_ASYNC))
    pc.printMsg(
        "--------------------------------------------------------------------------\n"
    )
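
asyncFetchAll(csv_src_file, csv_dest_file) is not included in this example. A rough sketch of what such a driver could look like, assuming it pairs with the fetchWithRetry(row, session, csv_out) variant shown later in this listing and that the source CSV carries the column names that variant reads (ID, SourceSite, Url, Title, Content, WeightedContent):

import asyncio
import csv
from aiohttp import ClientSession, TCPConnector

async def asyncFetchAll(csv_src_file, csv_dest_file):
    # Sketch only: read the source rows, then fan out fetchWithRetry under one shared session.
    with open(csv_src_file, newline='') as f:
        rows = list(csv.DictReader(f))
    connector = TCPConnector(limit=50)                           # matches the "conn_lim =50" note above
    async with ClientSession(connector=connector) as session:
        tasks = [fetchWithRetry(row, session, csv_dest_file) for row in rows]
        await asyncio.gather(*tasks)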
Example #6
async def fetchWithRetry(row, session):
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] +
                    urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {}  Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
Example #7
def create_th(ts):
    """
        Just creates the th_table(Topic Hotness); if not exists already
    """
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < Create_th: DB Connection Opened > ---------------------------------------------\n"
    )
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(th_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(th_table))
    else:  # creating new table
        c.execute(
            "CREATE TABLE {} (ID, NodeName, LeftMptt, RightMptt, DepthLevel, ItemCount, AvgPopI, HN_IDs,R_IDs)"
            .format(th_table))

    index = 1
    q = 'INSERT INTO ' + th_table + ' VALUES (?,?,?,?,?,?,?,?,?)'
    for node_name in node_dict:
        query_from_tree = queryTreeNodeForCountNPopi(node_name)
        d = (index, node_name, -1, -1, 0, query_from_tree[0],
             query_from_tree[1], '', '')
        c.execute(q, d)
        index += 1

    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < Create_th: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printSucc(
        "\t **************************************** TH Table Created: {} ******************************************************\n"
        .format(th_table))
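
queryTreeNodeForCountNPopi and node_dict come from the tree module and are not defined in this listing. A plausible stand-in, assuming node_dict maps a node name to the in-memory node whose count/popi fields were set by updateLeafNodes and updateParentNodes:

def queryTreeNodeForCountNPopi(node_name):
    # Hypothetical helper: returns (ItemCount, AvgPopI) for a node of the in-memory tree.
    node = node_dict[node_name]
    return (node.count, node.popi)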
Example #8
def run(ts):
    startTime = time.time()

    try:
        update_modelTags(ts)
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Tagger Simulator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    endTime = time.time()

    pc.printSucc(
        "**************************** Tagger(Simulator) Run is Complete for wc **********************************************"
    )
    pc.printWarn(
        "| \t\t TIME TAKEN FOR Tagger(Simulator) Run(sec)     \t\t | \t\t {}  \t\t |"
        .format(round((endTime - startTime), 5)))
    pc.printSucc(
        "***********************************************************************************************************************\n\n"
    )
Example #9
def run(ts):
    """
        Scrapes PH api for last 7 days & puts data in WP-DB.
            * Api supports daywaise only. So scrape for one day at a time
            * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category)
        * NOTE:
            * No threshold set on upvotes or comments rn.Maybe later?
            * API-Ratelimit: You can make up to 900 requests every 15 minutes, else gives `status 429` in response.If that happens, wait for 16 mins, then hit again.   
                * Retry 2 times; if failed nonetheless, skip!
            * Content = Tagline
            * URL: is the PH url only. Going to the product page & then finding the actual link is overkill
                * (this could also help later on getting their permission while monetizing)
            * Used self-retry logic. but check this package:: Read about requests.retries here: [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure), [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1)
        Input: ts (format: 1598692058.887741)

        * ============= row is an array with indices: 
        (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
    """

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'.
        format(datetime.fromtimestamp(ts), wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    """
        here is how you add day to `ts`:

        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094
        newts.timestamp() # 1598783633.284871
        datetime.fromtimestamp(ts) #2020-08-29 17:15:32
        # get date from it: 
        datetime.fromtimestamp(ts).date() #2020-08-29
    """
    """ days_arr has last 7 days(including today's) (YYYY-MM-DD)date strings ; just the way PH's API needs
    """
    curr_date = str(int(ts))
    days_arr = [str(datetime.fromtimestamp(int(ts)).date())]  # '2020-08-29'

    for i in range(6):
        new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        curr_date = new_ts
        days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date()))

    PH_REQ_HEADERS = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN,
        "Host": "api.producthunt.com"
    }

    # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv'
    index = gw.WP_TOTAL_ENTRIES_YET + 1

    for date in days_arr:
        pc.printMsg(
            " ................. scraping for date =  {} .................\n".
            format(date))
        url = 'https://api.producthunt.com/v1/posts?day=' + date
        try:
            data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2,
                                                5, 10)
            if (data == -1):
                pc.printErr(
                    "\t\txxxxxx Unable to hit {} after 2 retries.Skipping this date( {} ) xxxxxx\n"
                    .format(url, date))
            else:
                items_arr = json.loads(data.content)["posts"]
                for item in items_arr:
                    # print(json.dumps(item, indent = 4))
                    """ get all the tags attached along with the item """
                    source_tags = []
                    for tag in item["topics"]:
                        source_tags.append(tag["name"])
                    entry = [
                        index, "PH",
                        datetime.fromtimestamp(ts).date(),
                        int(ts),
                        date_conversion.PHDate(str(item["created_at"])),
                        item["name"], item["discussion_url"],
                        item["thumbnail"]["image_url"],
                        json.dumps(source_tags), item["votes_count"],
                        item["comments_count"], '', item["tagline"]
                    ]
                    # csv_functions.putToCsv(csv_file,entry)
                    c.execute(
                        'INSERT INTO ' + wp_table +
                        ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                    index = index + 1
                    gw.PH_TOTAL_ITEMS_GOT_YET += 1

        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

        pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format(
            gw.PH_TOTAL_ITEMS_GOT_YET))

    gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************"
        .format(wp_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-PH (sec) ',
        round((endTime - startTime), 5)
    ])
    pc.printSucc(table)
    print("\n\n")
async def fetchWithRetry(row, session):
    """
        Hits ulr(with retires):
        * if status == 200:
            return resposne ((raw)Content & (raw)WeightedContent in row)
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
        INPUT: `row` is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """

    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')                   #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")

                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
        .format(row[0], row[1]))
    return row
async def fetchWithRetry(row, session, csv_out):
    """
        Hits ulr(with retires):
        * if status == 200:
            put content into csv
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
    """

    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(
                    res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row["ID"], row["SourceSite"], status,
                            row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== "
        .format(row["ID"], row["SourceSite"]))
    return row
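write_result(csv_out, row) is awaited in the variant above but not defined in this section. A minimal stand-in, assuming csv_out is a file path and row is the dict used throughout this variant, using aiofiles so the append does not block the event loop (the field list is an assumption based on the keys accessed above):

import csv
import io
import aiofiles

CSV_FIELDS = ["ID", "SourceSite", "ProcessingDate", "ProcessingEpoch", "CreationDate", "Title", "Url",
              "SourceTags", "ModelTags", "NumUpvotes", "NumComments", "PopI", "WeightedContent", "Content"]

async def write_result(csv_out, row):
    # Hypothetical sketch: serialize one row dict and append it to the output csv.
    buf = io.StringIO()
    csv.DictWriter(buf, fieldnames=CSV_FIELDS, extrasaction='ignore').writerow(row)
    async with aiofiles.open(csv_out, mode='a', newline='') as f:
        await f.write(buf.getvalue())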
def RunAsync(ts):
    """
        Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url- ASYNCLY
        * NOTE:
            * If conent is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), SEMAPHORE_COUNT, CONNTECTION_COUNT,
                wc_table))

    startTime = time.time()
    # csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'.csv'
    # csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'

    # Run the async job
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(asyncFetchAll(ts)))

    endTime = time.time()
    pc.printSucc(
        "\n****************** (Async)Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))

    pc.printMsg(
        "\n--------------------------------------------------------------------------------------------------------------------------------"
    )
    pc.printMsg(
        "|\t\t IN : Total Entries in Url-Scraped Output Table                   \t\t  | \t\t {} \t\t|"
        .format(ENTRIES_TO_BE_WRITTEN))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_DIRECT(content exists)                \t\t  | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_SCRAPED(scraped entries)              \t\t  | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printErr(
        "\n\n------------------ ERRORS In Scraping (Written nonetheless; counted in  WRITTEN_ENTRIES_ASYNC_SCRAPED) --------------------------\n"
    )
    pc.printMsg(
        "================================================================================================================================="
    )
    pc.printErr(
        "|\t\t ERR_ASYNC_NO_CONTENT_IN_SCRAPING(url hit;not content-written )   \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_NO_CONTENT_IN_SCRAPING))
    pc.printErr(
        "|\t\t ERR_ASYNC_ON_URL_ERROR(url not hit)                              \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_ON_URL_ERROR))
    pc.printErr(
        "|\t\t ERR_ASYNC_TRIED_ERR(other try/catch errs)                        \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_TRIED_ERR))
    pc.printMsg(
        "---------------------------------------------------------------------------------------------------------------------------------\n"
    )
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Semaphore Count = {}, Tcp connector limit ={} ]\n'
        .format(SEMAPHORE_COUNT, CONNTECTION_COUNT))
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(sec) = {} ]\n'.
        format(int(endTime - startTime)))
def run_wp(ts):
    """
        runs on the table (wp_<ts>) in wp.db & updates the PopI column in it
    """

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started  PopICalculator@wp ................... => TABLE: {}\n'.format(datetime.datetime.fromtimestamp(ts),wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- <  PopICalculator@wp : DB/wp Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    pc.printWarn("\tRunning PopiCalculator for wp ....... \t NOW: {}".format(time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn("\t\t. .  .  .  .  .  .  .  .  .  .  .  .  ....... PopI Calculation for wp table Started .......  .  .  .  .  .   .  .  .  .  .  .  .  .  .  .  .  .  .  .")

    days = GetLastSevenDays(ts)

    """ Initialize both maps(weekly & daily): key = PopiItem, Value = (max_upvotes, max_comments) """

    DailyMaxMap = collections.defaultdict(list)
    WeeklyMaxMap = collections.defaultdict(list)

    q = "select * from " + wp_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """
           * ============= row is an array with indices: 
            (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),
            ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
        """
        popi_item_daily = PopiItem(row[1],row[4])
        popi_item_weekly = PopiItem(row[1],row[2])

        # for daily max
        if popi_item_daily in DailyMaxMap:
            max_upvotes_day = DailyMaxMap[popi_item_daily][0]
            max_comments_day = DailyMaxMap[popi_item_daily][1]
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            d = (row[1],row[4])
            max_upvotes_day = c.execute(q,d)
            max_upvotes_day = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            max_comments_day = c.execute(q,d)
            max_comments_day = c.fetchone()[0]
            DailyMaxMap[popi_item_daily] = (max_upvotes_day,max_comments_day)

        # For weekly max
        if popi_item_weekly in WeeklyMaxMap:
            max_upvotes_week = WeeklyMaxMap[popi_item_weekly][0]
            max_comments_week = WeeklyMaxMap[popi_item_weekly][1]
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            d = (row[1],row[2])
            max_upvotes_week = c.execute(q,d)
            max_upvotes_week = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            max_comments_week = c.execute(q,d)
            max_comments_week = c.fetchone()[0]
            WeeklyMaxMap[popi_item_weekly] = (max_upvotes_week,max_comments_week)

        popI = CalculatePopi(row[9],row[10],max_upvotes_day, max_comments_day, max_upvotes_week, max_comments_week,row[4],days[6],row[1])
        # pc.printWarn(" \t\t [wc_popi calculation] <ID={}><Source={}> ...................... PopI = {}".format(row[0],row[1],popI))
        # pc.printMsg("\t\t\t\t ........................ Updated PopI in wp_table..............")
        query = 'update ' + wp_table + ' set PopI = ? where ID = ? and SourceSite = ?'
        data = (popI,row[0],row[1])
        c.execute(query,data)

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < PopICalculator@wp: DB/wp Connection Closed > ---------------------------------------------\n")
    pc.printWarn("\t\t ---------------> TIME TAKEN FOR PopICalculator@wp    =>  {} => TABLE: {}\n".format(round((endTime - startTime),5),wp_table))
def ContentFormatting(ts):
    """ 
    Do:
        0. Update Content & WeightedContent column for each row
        1. get url_strings_content = getUrlString(row[13]) -> add it in weighted_content
        2. do clean_text(row[13])
        2. do clean_text(row[12])
        3. clean text clean_text(row[5]) -> add it in weighted_content :: clean_text(row[12]) + " " + clean_title + " " + url_strings_content
        4. if content col is still null; put title into it & in weightedContent too
    """

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .......... Content Formatting Started @Content_Scraper ...........  .  .  .  .  .  .  .  .  .  .  ."
    )

    signal.signal(signal.SIGALRM,
                  timeout_handler)  # timeouts on few function calls, see below
    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    for row in rows:
        t1 = time.time()
        row_list = list(row)
        if (len(row[13]) != 0):
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1
            clean_title = clean_text(row_list[5])
            if len(row_list[13]) == 0:
                pc.printWarn(
                    "\t\t\t\t --------- No content found on cleaning, using Title as Content :("
                )
                row_list[13] = clean_title
                row_list[12] = clean_title
            else:
                raw_content = row_list[13]
                signal.alarm(200)  # Timeout of 200 sec on function call
                content = clean_title  # if timeout happens, this will be the value of content
                try:
                    content = text_actions.contentfromhtml(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_content = clean_text(content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    weighted_content = text_actions.weightedcontentfromhtml(
                        raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_weighted_content = clean_text(weighted_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                url_string_text = ''  # if timeout happens, this will be the value of content
                try:
                    url_string_text = getUrlString(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! ....using empty str as url_string_text "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                row_list[13] = clean_content
                row_list[
                    12] = clean_weighted_content + " " + url_string_text + " " + clean_title

            row = tuple(row_list)

            pc.printWarn(
                "\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            content = row[13]
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (row[13], row[12], row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-with content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
        else:  #No content
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1
            pc.printMsg(
                "\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            clean_title = clean_text(row_list[5])
            content = clean_title
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (content, content, row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-without content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
    endTime = time.time()

    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Content Formatting is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Content Formatting)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])
    table.add_row([
        'TIME TAKEN - CONTENT FORMATTING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n'
        .format(round((endTime - startTime), 5) / 60))
    print("\n\n")
def RunSync(ts):
    """
        NOTE: pdf pages take a lot of time. Is it right to still scrape them?
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n"
    )

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']

    q = "select * from " + wc_table + " where length(Content) = 0"
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    pc.printMsg(
        "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n"
        .format(len(rows)))
    conn.commit()
    for row in rows:
        t1 = time.time()
        if (len(row[13]) == 0):
            try:
                if row[6][-4:] not in blob_pages:
                    response = web_requests.hitGetWithRetry(
                        row[6], '', False, 2, 0.5, 30)
                    if response != -1:
                        gw.CS_SYNC_ITEM_SCRAPED += 1
                        res = response.text
                        row_list = list(row)
                        row_list[13] = res
                        row = tuple(row_list)

                        pc.printWarn(
                            "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[1], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                        q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                        d = (row[13], row[0], row[1])
                        c.execute(q, d)
                        conn.commit()
                        # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0],row[1]))
                    else:
                        gw.CS_SYNC_URL_UNREACHABLE += 1
                        pc.printErr(
                            "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {}  \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                else:
                    pc.printMsg(
                        "\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page SYNC. Will use title. URL: {}  \t\t TimeTaken = {} \t NOW: {} "
                        .format(row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
            except Exception as e:
                gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1
                pc.printErr(
                    "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime()),
                            round((time.time() - t1), 5), e))
                # logging.error(traceback.format_exc())
                pass
    endTime = time.time()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n'
        .format(round((endTime - startTime), 5) / 60))
    print("\n\n")
async def RunAsync(ts):
    """
        Runs the async scraping pass gw.ASYNC_SERIES_CONNECTION times in series; each pass scrapes the remaining empty-Content rows in parallel
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    """ get rows with content alredy present & put in gw.CS_ITEMS_WRITTEN_DIRECT .Will work just for 1st iteration"""
    c = conn.cursor()
    q = "select count(*) from " + wc_table + " where length(Content) != 0"
    no_scraping_needed_item_count = c.execute(q)
    no_scraping_needed_item_count = c.fetchone()[0]
    gw.CS_ITEMS_WRITTEN_DIRECT = no_scraping_needed_item_count

    conn.commit()
    # conn.close()
    # gw.SQL_CONN_OPEN -= 1

    for i in range(1, gw.ASYNC_SERIES_CONNECTION + 1):
        gw.CS_BOYS_STILL_PLAYING = 0
        pc.printMsg(
            "\n\n..........-------------\/\/\/------\/\/\/------\/\/\/---------------............  Running Async for {} -th time - \t Numer of Async-runs remaining: {} \t\t NOW: {}\n\n"
            .format(i, (gw.ASYNC_SERIES_CONNECTION - i),
                    time.strftime("%H:%M:%S", time.localtime())))
        # asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(asyncFetchAll(ts,i)))
        await asyncFetchAll(conn, ts, i)
        pc.printMsg(
            "\t\t..........-------------\/\/\/------............  {} -th Async Running is done.Sleeping for 10 sec now......ZZZZZZZzzzzzzzzz\t\t NOW: {}\n\n"
            .format(i, time.strftime("%H:%M:%S", time.localtime())))
        time.sleep(10)

    conn.close()
    endTime = time.time()
    pc.printSucc(
        "\n\n***************************** All {} Async Content Scraping is Complete. TABLE: {} ******************"
        .format(gw.ASYNC_SERIES_CONNECTION, wc_table))
    print("\n\n")
    table = PrettyTable([
        'Success (Post ALL series Async Content Scraping)', 'Notation(if any)',
        'Value'
    ])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[A] (A+B1+B2+C=X)',
        gw.CS_ITEMS_WRITTEN_DIRECT
    ])
    table.add_row([
        'OUT : ITEMS SCRAPED WITH ASYNC', '[B] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'TIME TAKEN - ASYNC CONTENT SCRAPING (min)', '-',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-ASYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable([
        'Failures (Counted as-in last run of Async Content Scraping)', 'Value'
    ])
    table.add_row(
        ['COUNT. UNREACHABLE URLS in ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED SEMA EXCEP. in ASYNC ',
        gw.CS_ASYNC_SEMA_EXCEPTION_ERR
    ])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-r (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for All {} Sync Scraping (min) = {} ]\n'
        .format(gw.ASYNC_SERIES_CONNECTION,
                round((endTime - startTime), 5) / 60))
    print("\n\n")
async def asyncFetchAll(
        conn, ts,
        series_count):  #series_count : {1,gw.ASYNC_SERIES_CONNECTION}
    """
        just adds the content into the Content column; no cleaning, WeightedContent, or UrlString handling here.
    """

    # wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    # conn = sqlite3.connect(wc_db)
    # gw.SQL_CONN_OPEN += 1
    c = conn.cursor()
    q = "select * from " + wc_table + " where length(Content) = 0"  # only get the rows without content
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    # conn.close()
    # gw.SQL_CONN_OPEN -= 1
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_ASYNC: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()

    socket.gethostbyname("")
    connector = TCPConnector(limit=gw.CONNECTION_COUNT,
                             family=socket.AF_INET,
                             verify_ssl=False)
    pc.printMsg(
        "\n\n===================================================================== Doing {}-th Async Scraping in the same table =====================================================================\n\n"
        .format(series_count))
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:

        tasks = []
        sem = asyncio.Semaphore(gw.SEMAPHORE_COUNT)
        for row in rows:
            """
                ============= row is an array with indices: 
                ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
                SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
            """
            t1 = time.time()
            if (row[5] and row[6]):  # else ignore the entry
                gw.CS_BOYS_STILL_PLAYING += 1
                if gw.CS_BOYS_STILL_PLAYING % gw.CS_BOYS_PLAYING_LIMIT == 0:
                    pc.printMsg(
                        "\t [ASYNC_SCRAPING] sleeping for 1 sec...zzzzzzzzz....... \t BOYS_STILL_PLAYING = {}"
                        .format(gw.CS_BOYS_STILL_PLAYING))
                    time.sleep(1)
                # task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session,series_count))
                task = asyncio.ensure_future(
                    semaphoreSafeFetch(conn, sem, row, session, series_count,
                                       ts))
                tasks.append(task)

        await asyncio.gather(*tasks)
        # responses = await asyncio.gather(*tasks)
        # for row in responses:
        #     if row and len(row[13]) >0:
        #         try:
        #             content = row[13]
        #             conn = sqlite3.connect(wc_db)
        #             c = conn.cursor()
        #             q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
        #             d = (content,row[0],row[1])
        #             c.execute(q,d)
        #             pc.printSucc(" \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE =============== ".format(row[0],row[1]))
        #             conn.commit()
        #             conn.close()
        #         except Exception as e:
        #             logging.error(traceback.format_exc())
        #         pass
        # succs = []
        # for row in responses:
        #     if row and len(row[13]) >0:
        #         succ = asyncio.ensure_future(semaphoreSqlUpdate(sem,row,ts))
        #         succs.append(succ)
        # succsx = await asyncio.gather(*succs)

    endTime = time.time()

    pc.printSucc(
        "\n***************************** {} -th Async Content Scraping is Complete. TABLE: {} ******************"
        .format(series_count, wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row([
        'OUT : TOTAL ITEMS SCRAPED WITH ASYNC YET', '[B] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    pc.printSucc(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Async Scraping (min) = {} ]\n'
        .format(round((endTime - startTime), 5) / 60))
    print("\n\n")
Example #18
def run(ts):
    """
        Scrapes Algolia's HN api for last 7 days & puts data in WC-DB.
            * max number of entries in algolia's single api call = 1000. So scrape for one day at a time
            * Link to documentation: https://hn.algolia.com/api
        Note:
            1. For AskHN entries put `` tag & separate threshold
            1. For ShowHN entries put `` tag & separate threshold
            1. For Jobs@HN entries put `` tag => later as these entries dont have upvotes/comments
        Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts),wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
        here is how you add day to `ts`:

        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094
        newts.timestamp() # 1598783633.284871
        datetime.fromtimestamp(ts) #2020-08-29 17:15:32
    """

    """ ts_arr has last 7 days(including today's) (non-decimal stype)timestamps strings 
        TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format
    """
    ts_arr = [str(int(ts))]

    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    # for t in ts_arr:
    #     print("timestamp: {} \t date: {}".format(t,datetime.fromtimestamp(int(t))))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr)-1):
        startepoch = ts_arr[i]
        endepoch   = ts_arr[i+1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch,endepoch))
        
        """ 
            getting stories(articles) with upvotes_count > upvotes_threshold 
            Also including:
                1. TellHN (<tech_discuss>)
                2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    # print( '------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if("Launch HN:" in item["title"]):                                    # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if("Tell HN:" in item["title"]):                                      # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    sourceTag,
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

        """ getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'sideproj',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass


        """ getting AskHNs """

        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]

            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            

            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if(item["url"] is None): #as AskHNs dont have any url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'prog_query',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")

    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime),5)])
    pc.printSucc(table)
    print("\n\n")
async def fetchWithRetry(conn, row, session, series_count, ts):
    """
        Hits the url (with retries); a minimal retry sketch follows this function.
        * if status == 200:
            return the response ((raw) Content & (raw) WeightedContent in row)
        * if still unable to hit after retries: Content = Title, WeightedContent = Title
        INPUT: `row` is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """

    status = 400
    retry_cnt = 2
    sleep_time = 0.1

    t1 = time.time()
    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=gw.CS_ASYNC_REQ_TIMEOUT) as response:
            # res = await response.content.read()       # returns blob which gives error while ContentFormatter; hence discarded
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                gw.CS_ASYNC_ITEM_SCRAPED += 1
                gw.CS_BOYS_STILL_PLAYING -= 1
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== [ASYNCED SCRAPED#{}] Done ....... \t\t TimeTaken = {} \t NOW: {}"
                    .format(row[0], row[1], series_count,
                            round((round((time.time() - t1), 5)), 5),
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[13] = res
                row = tuple(row_list)

                # wc_db = 'dbs/wc.db'
                wc_table = 'wc_' + str(int(ts))
                # conn = sqlite3.connect(wc_db)
                # gw.SQL_CONN_OPEN += 1
                try:
                    c = conn.cursor()
                    q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                    d = (row[13], row[0], row[1])
                    c.execute(q, d)
                    pc.printWarn(
                        " \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    conn.commit()
                except Exception as e:
                    pc.printMsg(
                        " \t\t === XXXX ====== <ID= {} ><{}> [ASYNC ContentScraped] \t ERRR in INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    logging.error(traceback.format_exc())
                    pass
                # conn.close()
                # gw.SQL_CONN_OPEN -= 1

                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)
    if series_count == gw.ASYNC_SERIES_CONNECTION:
        gw.CS_ASYNC_URL_UNREACHABLE += 1
        pc.printErr(
            "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... Will try sync later: {} \t\t TimeTaken = {} \t NOW: {}"
            .format(row[0], row[1], row[6], round((time.time() - t1), 5),
                    time.strftime("%H:%M:%S", time.localtime())))
    # return row
    return []
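# --- Illustrative sketch (not part of the example above) ---
# The retry loop above boils down to: keep requesting while retries remain and
# the status is not 200, sleep briefly between attempts, and fall back to an
# empty result once retries are exhausted (the caller then uses Title as
# Content). `get_with_retry` and its defaults are illustrative assumptions.
import asyncio
from aiohttp import ClientSession

async def get_with_retry(session: ClientSession, url: str,
                         retries: int = 2, sleep_s: float = 0.1) -> str:
    while retries > 0:
        async with session.get(url) as resp:
            if resp.status == 200:
                text = await resp.text()
                if text:  # a non-empty body counts as success
                    return text
        retries -= 1
        await asyncio.sleep(sleep_s)  # non-blocking back-off between attempts
    return ""  # exhausted: let the caller fall back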
Exemple #20
0
def run(ts):
    """ I. Creates wc_table(in wc.db) & wp_table(in wp.dp) for the week
        II. Runs following scrapers serially and updates them in WC-DB:
            1. hn_scraper.py
            2. r_scraper.py
            4. ph_scraper.py => Api exists, Scraping not allowed(doint it anyway)
            3. ih_scraper.py => No Api, Scraping not allowed(postponed for later)

        Input: float(timestamp) - set when the main.py run is triggered
            * float because o/w `datetime.fromtimestamp(ts)` wont run on int
        Outpu: None, just put data in WC-DB
    """
    startTime = time.time()
    """ Initialize the weekly content tables in wc.db and wp.db"""

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(wc_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(wc_table))
    else:  # creating a new table
        c.execute(
            "CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, SourceTags,ModelTags,NumUpvotes, NumComments, PopI,WeightedContent,Content)"
            .format(wc_table))

    pc.printSucc(
        "\n**************************************************** wc_table created => {} **************************************************** \n"
        .format(wc_table))

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(wp_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(wp_table))
    else:  # creating a new table
        c.execute('''CREATE TABLE {}
                (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, ThumbnailUrl,SourceTags,NumUpvotes, NumComments, PopI,Content)'''
                  .format(wp_table))

    pc.printSucc(
        "\n**************************************************** wp_table created => {} **************************************************** \n"
        .format(wp_table))
    """ Run the scrapers sequentially """
    pc.printWarn(
        ".   .   .   .   .   .   .   .   .   .   .   .   .   .   .   ...... Started Running all the scrapers ......    .   .   .   .   .   .   .   .   .   .   .   .   .   .   .\n"
    )

    try:
        hn_scraper.run(ts)
        pc.printSucc(
            "\n================ HH url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-HN xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        r_scraper.run(ts)
        pc.printSucc(
            " \n================ Reddit url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-Reddit xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        ph_scraper.run(ts)
        pc.printSucc(
            " \n================ PH url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-PH xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    # try:
    #     ih_scraper.run(ts)
    #     print(" \n====== IH url scraper run: Complete ======\n")
    # except Exception as e:
    #     print(" XXXXXXXXXXXX Error in scraping IH for url XXXXXXXXXXXXXXXXX \n \t\tError = {}".format(str(e)))
    #     pass

    #TODO: add Lobsters here

    endTime = time.time()
    pc.printSucc(
        " ********************************************** URL Scraping(HN,r,PH) is complete *******************************************\n"
    )
    print("\n\n")
    table = PrettyTable(['Entity (Post all URL Scraping)', 'Value'])
    table.add_row(['TOTAL URL ITEMS IN WC TABLE ', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-All (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)
    print("\n\n")
Exemple #21
0
def run(ts):
    """
        Gets the top submissions of the week for the listed subreddits (PRAW's max limit per call is 1000; should be enough).
        Hence `ts` is not used for time-filtering here; it only names the table & stamps the processing dates.
        (A minimal PRAW sketch follows this function.)
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'.
        format(datetime.fromtimestamp(ts), wc_table))
    pc.printMsg(
        "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()

    blob_pages = ['.jpg', '.png', '.gif', '.mp3',
                  '.mp4']  # these give blob data; no point in scraping them

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    # Setup Client
    reddit = praw.Reddit(
        client_id=vault.R_CLIENT_ID,  # PERSONAL_USE_SCRIPT_14_CHARS
        client_secret=vault.R_CLIENT_SECRET,  # SECRET_KEY_27_CHARS
        user_agent=vault.R_USER_AGENT,  # YOUR_APP_NAME
        username=vault.R_USERNAME,  # YOUR_REDDIT_USER_NAME
        password=vault.R_PASSWORD)  # YOUR_REDDIT_LOGIN_PASSWORD

    for subreddit, tag_arr in LIST.items():
        try:
            pc.printWarn(
                "\t ............  Subreddit@R_UrlScraping : {}  .............".
                format(subreddit))
            sr = reddit.subreddit(subreddit)
            # for submission in sr.top('day',limit=10):                   # For testing....
            # for submission in sr.top('year',limit=1000):                #remove this & uncomemnt below line
            ENTRIES_IN_THIS_SUBRDDIT = 0
            for submission in sr.top('week',
                                     limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT
                                     ):  #NOTE: max limit is 1000
                #Check1: if the post is unlocked by mods
                content = ''
                """ Fixing permalink type urls """
                url = submission.url
                if (url[:2] == '/r'):
                    url = "https://www.reddit.com" + url
                if (submission.locked == False):
                    #Check2: if post is just an image, discard it
                    if submission.url[
                            -4:] not in blob_pages:  # as reddit currently hosts .png & .gif only
                        # if permalink is a substring of url OR the submission is a selfpost (text-only) => no need to scrape
                        # NOTE: a post may contain a description plus a link to another article it refers to, but that's not worth the extra processing time
                        if ((submission.permalink in submission.url)
                                or (submission.is_self == True)):
                            content = submission.selftext
                        entry = [
                            index, "r/" + subreddit,
                            datetime.fromtimestamp(ts).date(),
                            int(ts),
                            date_conversion.RedditDate(
                                str(datetime.fromtimestamp(
                                    submission.created))), submission.title,
                            url,
                            json.dumps(tag_arr), '', submission.score,
                            submission.num_comments, '', '',
                            text_actions.clean_text(content)
                        ]
                        # csv_functions.putToCsv(csv_file,entry)
                        c.execute(
                            'INSERT INTO ' + wc_table +
                            ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                        index += 1
                        ENTRIES_IN_THIS_SUBRDDIT += 1
            gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBRDDIT
            pc.printMsg(
                "\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBRDDIT = {} \t\t |  \t gw.R_TOTAL_ITEMS_GOT_YET = {}"
                .format(ENTRIES_IN_THIS_SUBRDDIT, gw.R_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET

    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n"
    )
    pc.printSucc(
        "\n\n***************************** Reddit Url Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post r URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by Reddit', gw.R_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-r (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)
    print("\n\n")
def run(ts):
    nest_asyncio.apply()  # to be able to run an async loop from within an async loop (see the sketch after this function)

    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), gw.SEMAPHORE_COUNT,
                gw.CONNECTION_COUNT, wc_table))

    startTime = time.time()
    """ scrape content in async """
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(RunAsync(ts)))
    time.sleep(10)
    """ scrape remaining items with sync """
    RunSync(ts)
    """ formatting everything in the end-done in sync """
    time.sleep(10)
    ContentFormatting(ts)

    endTime = time.time()
    pc.printSucc(
        "\n\n\n\n\n****************** Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Entities (Post Content Scraping-all)', 'Notation(if any)', 'Value'])

    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH ASYNC', '[A] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CS_OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[B] (A+B+C=X)',
        gw.CS_ITEMS_WRITTEN_DIRECT
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CF_OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'CF_OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])

    pc.printSucc(table)

    pc.printErr(
        "\n\n------------------------------------------ ERRORS (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Content Scraping-all)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED SEMA EXCEP. - ASYNC ',
        gw.CS_ASYNC_SEMA_EXCEPTION_ERR
    ])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(min) = {} ]\n\n\n\n\n\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n\n\n")
async def asyncFetchAll(ts):
    """
        INPUT: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT

    tasks = []
    sem = asyncio.Semaphore(SEMAPHORE_COUNT)

    #==========================init connection
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()

    # """ Initialize the output file """
    # headers = ['ID', 'SourceSite', 'ProcessingDate','ProcessingEpoch','CreationDate', 'Title', 'Url', 'SourceTags','ModelTags','NumUpvotes', 'NumComments', 'PopI','WeightedContent','Content']
    # csv_functions.creteCsvFile(csv_out,headers)

    global ENTRIES_TO_BE_WRITTEN
    global WRITTEN_ENTRIES_ASYNC_SCRAPED
    global WRITTEN_ENTRIES_ASYNC_DIRECT
    global ASYNC_ENTRIES_TO_BE_SCRAPED

    connector = TCPConnector(limit=CONNTECTION_COUNT,
                             family=socket.AF_INET,
                             verify_ssl=False)
    # connector = TCPConnector(limit=CONNTECTION_COUNT)
    # connector = ProxyConnector.from_url('http://*****:*****@127.0.0.1:1080')
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        q = "select * from " + wc_table
        rows_head = c.execute(q)
        rows = rows_head.fetchall()
        for row in rows:
            """
                ============= row is an array with indices: 
                ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
                SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
            """
            ENTRIES_TO_BE_WRITTEN += 1
            if (len(row[13]) != 0):
                pc.printWarn(
                    "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                clean_content = row[13]  #Already cleaned in url_scraper
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content

                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Direct] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))
            elif (row[5] and row[6]):  # else ignore the entry
                ASYNC_ENTRIES_TO_BE_SCRAPED += 1
                print("\t\t\t\t\t SENT...... SENT_COUNT = {}".format(
                    ASYNC_ENTRIES_TO_BE_SCRAPED))
                # if(ASYNC_ENTRIES_TO_BE_SCRAPED%100 == 0):
                #     pc.printMsg("\t\t\t.......................zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz <NAP TIME> for 5 sec After 100 async-requests while content scraping #ZarooriHaiJi zzzzzzzzzzzzzzz.......................")
                #     time.sleep(5)
                task = asyncio.ensure_future(
                    semaphoreSafeFetch(sem, row, session))
                tasks.append(task)

        responses = await asyncio.gather(*tasks)
        for row in responses:
            if row:
                clean_content = clean_text(row[13])
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content
                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Scraped] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Closed > ---------------------------------------------\n"
    )