def run(ts):
    startTime = time.time()

    try:
        run_wc(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        run_wp(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    
    endTime = time.time()

    pc.printSucc("**************************** PopI Calculation is Done for wc & wp ********************************\n\n")
    pc.printWarn("| \t\t TIME TAKEN FOR PopICalculators-both     \t\t | \t\t {}  \t\t |".format(round((endTime - startTime),5)))
    pc.printSucc("*************************************************************************************************\n\n")

    pc.printSucc("\n\n***************************** PopI Calculation is Complete.************************")
    print("\n\n")
    table = PrettyTable(['Entity (Post PopI Calculation)', 'Value'])
    table.add_row(['TIME TAKEN FOR PopICalculators(wc & wp) (min)', round((endTime - startTime)/60,2)])
    pc.printSucc(table)
    print("\n\n")
def create_test_table(n):  # similar to wc_1599816944 (the url-only table)
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(n)
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    # c.execute("create table " + wc_table + " as select * from wc_1601413857")   #url table
    c.execute("create table " + wc_table +
              " as select * from wc_1601511004")  #full content table
    pc.printWarn(
        "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Created test table in dc.db => {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@"
        .format(wc_table))
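# Illustrative call for create_test_table (hypothetical usage; it assumes
# dbs/wc.db already contains the source table wc_1601511004 copied above, and
# the epoch suffix below is only an example):
#
#     create_test_table(int(time.time()))   # creates a fresh copy, e.g. wc_1602000000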
async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_src_file & csv_dest_file(to be written)
        NOTE: 
            * Semaphore limit is: 5
            * While writing the response to csv_dest_file, it is done in chunks of `N` entries at a time
    """

    tasks = []
    sem = asyncio.Semaphore(5)
    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=10)
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = row["Title"] + row[
                        "WeightedContent"]
                    row["Content"] = row["Content"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ==============  Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(
                        semaphoreSafeFetch(sem, row, session, csv_out))
                    tasks.append(task)

        responses = await asyncio.gather(*tasks)
        pc.printMsg(
            "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Total items to actually scrape(found w/o Content) = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
            .format(len(responses)))
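# write_result is awaited above but not defined in this section. A minimal
# sketch of what it might look like, assuming it simply appends one row (dict
# or list) to the output csv; the real helper may batch writes in chunks of
# `N` entries, as the docstring hints:
import csv

async def write_result(csv_out, row):
    """Append a single result row to csv_out (sketch, not the original helper)."""
    values = list(row.values()) if isinstance(row, dict) else list(row)
    with open(csv_out, mode='a', newline='') as f:
        csv.writer(f).writerow(values)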
async def semaphoreSafeFetch(conn, sem, row, session, series_count, ts):
    """
        Simply puts semaphore limit on async-fetch
    """

    # async with sem:
    #     try:
    #         row = await fetchWithRetry(row, session,series_count)
    #         if row and len(row[13]) >0:
    #             # await semaphoreSqlUpdate(row,ts)
    #             wc_db = 'dbs/wc.db'
    #             wc_table = 'wc_' + str(int(ts))
    #             content = row[13]
    #             conn = sqlite3.connect(wc_db)
    #             gw.SQL_CONN_OPEN += 1
    #             c = conn.cursor()
    #             q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
    #             d = (content,row[0],row[1])
    #             c.execute(q,d)
    #             pc.printWarn(" \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== ".format(row[0],row[1],gw.SQL_CONN_OPEN))
    #             conn.commit()
    #             conn.close()
    #             gw.SQL_CONN_OPEN -= 1
    #     except Exception as e:
    #         if series_count == gw.ASYNC_SERIES_CONNECTION:      # dont count the errors in each series run.Some might get ressolved in next one.
    #             gw.CS_ASYNC_SEMA_EXCEPTION_ERR += 1
    #             pc.printWarn("\t======= XXXXXXXXXXXXXX ======>> <ID = {}><src= {} > NOW = {} Async Scraping failed.Will try SYNC later... \n \t\t ERROR=> {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime()) ,e))
    #             # logging.error(traceback.format_exc())
    #         pass
    # return []
    async with sem:
        try:
            row = await fetchWithRetry(conn, row, session, series_count, ts)
            if row and len(row[13]) > 0:
                # await semaphoreSqlUpdate(row,ts)
                content = row[13]
        except Exception as e:
            if series_count == gw.ASYNC_SERIES_CONNECTION:  # don't count errors in each series run; some may get resolved in the next one
                gw.CS_ASYNC_SEMA_EXCEPTION_ERR += 1
                pc.printWarn(
                    "\t======= XXXXXXXXXXXXXX ======>> <ID = {}><src= {} > NOW = {} Async Scraping failed.Will try SYNC later... \n \t\t ERROR=> {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime()), e))
                # logging.error(traceback.format_exc())
            pass
    return []
def return_all_descendents(ts, root):
    """
        Returns all the descendents of node in tag-tree where node.NodeName = root
    """

    pc.printMsg(
        " \t\t???????????????????????????????????? Query for All Descendents of NodeName = {}"
        .format(root))

    descendents = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n"
    )

    q = 'select LeftMptt, RightMptt from ' + th_table + ' where NodeName = ?'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as descendents"
            .format(root, th_table))
        return descendents

    pc.printMsg(" root.LeftMptt = {} , root.RightMptt = {} \n".format(
        root_mptt_values[0], root_mptt_values[1]))

    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ?'
    d = (root_mptt_values[0], root_mptt_values[1])
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t * DESCENDENT of {} :: {}".format(root, row))
        descendents.append(row)

    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n"
    )
    return descendents
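# The query above relies on the nested-set (MPTT) invariant: node B is a
# descendent of node A exactly when A.LeftMptt < B.LeftMptt and
# B.RightMptt < A.RightMptt. A tiny self-contained illustration with made-up
# node names and Left/Right values:
mptt_demo = {
    'root':       (1, 8),
    'child_a':    (2, 5),
    'grandchild': (3, 4),
    'child_b':    (6, 7),
}
l, r = mptt_demo['root']
demo_descendents = [n for n, (lv, rv) in mptt_demo.items() if lv > l and rv < r]
# demo_descendents == ['child_a', 'grandchild', 'child_b']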
def return_imm_children(ts, root):
    """
        INPUT: ts, root.name (string)

        OUTPUT:
            Returns full row(ID, NodeName, LeftMptt, RightMptt, DepthLevel, ItemCount, AvgPopI, HN_IDs,R_IDs) 
            of just the immediate children of node in tag-tree where node.NodeName = root
    """

    # pc.printMsg(" \t\t ???????????????????????????????????? Query for Immediate Children of NodeName = {}".format(root))

    children = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n")

    q = 'select LeftMptt, RightMptt, DepthLevel from ' + th_table + ' where NodeName = ? ;'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as children"
            .format(root, th_table))
        return children

    pc.printMsg(
        " \t ROOT: {} \troot.LeftMptt = {} , root.RightMptt = {} , root.DepthLevel = {}\n"
        .format(root, root_mptt_values[0], root_mptt_values[1],
                root_mptt_values[2]))

    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ? And DepthLevel = ? '
    d = (root_mptt_values[0], root_mptt_values[1], root_mptt_values[2] + 1)
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t *  CHILD of {} :: {}".format(root, row[1]))
        children.append(row)

    conn.commit()
    conn.close()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n")
    return children
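# Illustrative usage (hypothetical values; it assumes a tag-tree table
# th_<ts> already exists and contains a node literally named 'root'):
#
#     for child in return_imm_children(1601292562, 'root'):
#         print(child[1])   # NodeName column of each immediate child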
async def fetchWithRetry(row, session):
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] +
                    urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {}  Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
def run(ts):
    startTime = time.time()

    try:
        update_modelTags(ts)
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Tagger Simulator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    endTime = time.time()

    pc.printSucc(
        "**************************** Tagger(Simulator) Run is Complete for wc **********************************************"
    )
    pc.printWarn(
        "| \t\t TIME TAKEN FOR Tagger(Simulator) Run(sec)     \t\t | \t\t {}  \t\t |"
        .format(round((endTime - startTime), 5)))
    pc.printSucc(
        "***********************************************************************************************************************\n\n"
    )
def updateLeafNodes(ts):
    """     
        This is the query:
           select count(ID) from wc_1601292562 where ModelTags like "%prog_query%" or SourceTags like "%prog_query%";

    """

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started  UpdateLeafNodes@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning UpdateLeafNodes for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ."
    )
    startTime = time.time()

    for tag in tags_names:
        q = 'select count(ID) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        d = (
            '%"{}"%'.format(tag),
            '%"{}"%'.format(tag),
        )
        item_count = c.execute(q, d)
        item_count = c.fetchone()[0]
        q = 'select avg(PopI) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        avg_popi = c.execute(q, d)
        avg_popi = c.fetchone()[0]
        if avg_popi is None:
            avg_popi = 0
        else:
            avg_popi = round(avg_popi, 10)
        curr_node = node_dict[tag]
        if curr_node.isTag:  # update only if it's a leaf
            curr_node.count = item_count
            curr_node.popi = avg_popi
            pc.printSucc(
                " \t\t\t..... Updated node: {}  \t => c = {}  , p = {}".format(
                    curr_node.name, item_count, avg_popi))

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR UpdateLeafNodes In Tree  (sec)   =>  {} \n"
        .format(round((endTime - startTime), 5)))
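# updateLeafNodes assumes two module-level objects that are not shown in this
# section: `tags_names` (the leaf tag names) and `node_dict` (tag name -> node
# with name/isTag/count/popi attributes). A minimal sketch of the assumed
# shape, with illustrative tag names taken from the scrapers below:
class TagNode:
    def __init__(self, name, isTag=True):
        self.name = name      # NodeName, e.g. 'prog_query'
        self.isTag = isTag    # True only for leaf tags that items can carry
        self.count = 0        # number of matching items, filled by updateLeafNodes
        self.popi = 0.0       # average PopI of matching items

tags_names = ['prog_query', 'sideproj', 'startup', 'tech_discuss']  # illustrative
node_dict = {name: TagNode(name) for name in tags_names}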
def update_modelTags(ts):
    """
        runs on the table(wc_ts) in wc.db & update ModelTag
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started  TaggerSimulator@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning PopiCalculator for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ."
    )
    startTime = time.time()

    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """
            ============= row is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
        """
        modelTags = []

        #TODO: call actual Api here, when model is ready
        # pc.printMsg("\t <ID = {}><src= {} > [Tagger] Start................ ".format(row[0],row[1]))

        conf_arr = SimulatorApi(row[13], row[12])
        for item in conf_arr:
            tag = item[0]
            conf = item[1]
            if (conf >= tags_threshold[tag]):
                modelTags.append(tag)
                # pc.printWarn(" \t\t\t\t => Added \t {} \t conf = {}".format(tag,conf))
        modelTags = json.dumps(modelTags)
        query = 'update ' + wc_table + ' set ModelTags = ? where ID = ? and SourceSite = ?'
        data = (modelTags, row[0], row[1])
        c.execute(query, data)

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR TaggerSimulator@wc(sec)    =>  {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), wc_table))
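# SimulatorApi and tags_threshold are used above but defined elsewhere. Per
# the TODO, SimulatorApi stands in for the real tagging model and returns
# (tag, confidence) pairs. A hypothetical stand-in consistent with that usage
# (tag names and thresholds are illustrative only):
import random

tags_threshold = {'prog_query': 0.5, 'sideproj': 0.5, 'startup': 0.5, 'tech_discuss': 0.5}

def SimulatorApi(content, weighted_content):
    """Return a list of (tag, confidence) tuples for the given text (sketch)."""
    return [(tag, round(random.random(), 3)) for tag in tags_threshold]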
def run(ts):
    nest_asyncio.apply()  # to be able to run an async loop from within another async loop

    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), gw.SEMAPHORE_COUNT,
                gw.CONNECTION_COUNT, wc_table))

    startTime = time.time()
    """ scrape content in async """
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(RunAsync(ts)))
    time.sleep(10)
    """ scrape remaining items with sync """
    RunSync(ts)
    """ formatting everything in the end-done in sync """
    time.sleep(10)
    ContentFormatting(ts)

    endTime = time.time()
    pc.printSucc(
        "\n\n\n\n\n****************** Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Entities (Post Content Scraping-all)', 'Notation(if any)', 'Value'])

    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH ASYNC', '[A] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CS_OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[B] (A+B+C=X)',
        gw.CS_ITEMS_WRITTEN_DIRECT
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CF_OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'CF_OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])

    pc.printSucc(table)

    pc.printErr(
        "\n\n------------------------------------------ ERRORS (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Content Scraping-all)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED SEMA EXCEP. - ASYNC ',
        gw.CS_ASYNC_SEMA_EXCEPTION_ERR
    ])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(min) = {} ]\n\n\n\n\n\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n\n\n")
def run(ts):
    """
        Scrapes Algolia's HN api for the last 7 days & puts data in WC-DB.
            * Max number of entries in a single Algolia api call = 1000, so scrape one day at a time
            * Link to documentation: https://hn.algolia.com/api
        Note:
            1. For AskHN entries put `prog_query` tag & separate threshold
            2. For ShowHN entries put `sideproj` tag & separate threshold
            3. For Jobs@HN entries put `` tag => later, as these entries don't have upvotes/comments
        Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts),wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
        here is how you add day to `ts`:

        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094
        newts.timestamp() # 1598783633.284871
        datetime.fromtimestamp(ts) #2020-08-29 17:15:32
    """

    """ ts_arr has last 7 days(including today's) (non-decimal stype)timestamps strings 
        TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format
    """
    ts_arr = [str(int(ts))]

    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    # for t in ts_arr:
    #     print("timestamp: {} \t date: {}".format(t,datetime.fromtimestamp(int(t))))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr)-1):
        startepoch = ts_arr[i]
        endepoch   = ts_arr[i+1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch,endepoch))
        
        """ 
            getting stories(articles) with upvotes_count > upvotes_threshold 
            Also including:
                1. TellHN (<tech_discuss>)
                2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    # print( '------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if("Launch HN:" in item["title"]):                                    # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if("Tell HN:" in item["title"]):                                      # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    sourceTag,
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

        """ getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'sideproj',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass


        """ getting AskHNs """

        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]

            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            

            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if(item["url"] is None): #as AskHNs dont have any url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'prog_query',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")

    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime),5)])
    pc.printSucc(table)
    print("\n\n")
async def fetchWithRetry(row, session, csv_out):
    """
        Hits url (with retries):
        * if status == 200:
            put content into csv
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
    """

    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(
                    res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row["ID"], row["SourceSite"], status,
                            row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== "
        .format(row["ID"], row["SourceSite"]))
    return row
def run_wp(ts):
    """
        runs on the table(wp_ts) in wp.db & updates PopI column in it
    """

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started  PopICalculator@wp ................... => TABLE: {}\n'.format(datetime.datetime.fromtimestamp(ts),wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- <  PopICalculator@wp : DB/wp Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    pc.printWarn("\tRunning PopiCalculator for wp ....... \t NOW: {}".format(time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn("\t\t. .  .  .  .  .  .  .  .  .  .  .  .  ....... PopI Calculation for wp table Started .......  .  .  .  .  .   .  .  .  .  .  .  .  .  .  .  .  .  .  .")

    days = GetLastSevenDays(ts)

    """ Initialize both maps(weekly & daily): key = PopiItem, Value = (max_upvotes, max_comments) """

    DailyMaxMap = collections.defaultdict(list)
    WeeklyMaxMap = collections.defaultdict(list)

    q = "select * from " + wp_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """
           * ============= row is an array with indices: 
            (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),
            ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
        """
        popi_item_daily = PopiItem(row[1],row[4])
        popi_item_weekly = PopiItem(row[1],row[2])

        # for daily max
        if popi_item_daily in DailyMaxMap:
            max_upvotes_day = DailyMaxMap[popi_item_daily][0]
            max_comments_day = DailyMaxMap[popi_item_daily][1]
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            d = (row[1],row[4])
            max_upvotes_day = c.execute(q,d)
            max_upvotes_day = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            max_comments_day = c.execute(q,d)
            max_comments_day = c.fetchone()[0]
            DailyMaxMap[popi_item_daily] = (max_upvotes_day,max_comments_day)

        # For weekly max
        if popi_item_weekly in WeeklyMaxMap:
            max_upvotes_week = WeeklyMaxMap[popi_item_weekly][0]
            max_comments_week = WeeklyMaxMap[popi_item_weekly][1]
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            d = (row[1],row[2])
            max_upvotes_week = c.execute(q,d)
            max_upvotes_week = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            max_comments_week = c.execute(q,d)
            max_comments_week = c.fetchone()[0]
            WeeklyMaxMap[popi_item_weekly] = (max_upvotes_week,max_comments_week)

        popI = CalculatePopi(row[9],row[10],max_upvotes_day, max_comments_day, max_upvotes_week, max_comments_week,row[4],days[6],row[1])
        # pc.printWarn(" \t\t [wc_popi calculation] <ID={}><Source={}> ...................... PopI = {}".format(row[0],row[1],popI))
        # pc.printMsg("\t\t\t\t ........................ Updated PopI in wp_table..............")
        query = 'update ' + wp_table + ' set PopI = ? where ID = ? and SourceSite = ?'
        data = (popI,row[0],row[1])
        c.execute(query,data)

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < PopICalculator@wp: DB/wp Connection Closed > ---------------------------------------------\n")
    pc.printWarn("\t\t ---------------> TIME TAKEN FOR PopICalculator@wp    =>  {} => TABLE: {}\n".format(round((endTime - startTime),5),wp_table))
def RunAsync(ts):
    """
        Picks wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from urls - ASYNCLY
        * NOTE:
            * If content is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), SEMAPHORE_COUNT, CONNTECTION_COUNT,
                wc_table))

    startTime = time.time()
    # csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'.csv'
    # csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'

    # Run the async job
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(asyncFetchAll(ts)))

    endTime = time.time()
    pc.printSucc(
        "\n****************** (Async)Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))

    pc.printMsg(
        "\n--------------------------------------------------------------------------------------------------------------------------------"
    )
    pc.printMsg(
        "|\t\t IN : Total Entries in Url-Scraped Output Table                   \t\t  | \t\t {} \t\t|"
        .format(ENTRIES_TO_BE_WRITTEN))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_DIRECT(content exists)                \t\t  | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_SCRAPED(scraped entries)              \t\t  | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printErr(
        "\n\n------------------ ERRORS In Scraping (Written nonetheless; counted in  WRITTEN_ENTRIES_ASYNC_SCRAPED) --------------------------\n"
    )
    pc.printMsg(
        "================================================================================================================================="
    )
    pc.printErr(
        "|\t\t ERR_ASYNC_NO_CONTENT_IN_SCRAPING(url hit;not content-written )   \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_NO_CONTENT_IN_SCRAPING))
    pc.printErr(
        "|\t\t ERR_ASYNC_ON_URL_ERROR(url not hit)                              \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_ON_URL_ERROR))
    pc.printErr(
        "|\t\t ERR_ASYNC_TRIED_ERR(other try/catch errs)                        \t\t  | \t\t {} \t\t|"
        .format(ERR_ASYNC_TRIED_ERR))
    pc.printMsg(
        "---------------------------------------------------------------------------------------------------------------------------------\n"
    )
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Semaphore Count = {}, Tcp connector limit ={} ]\n'
        .format(SEMAPHORE_COUNT, CONNTECTION_COUNT))
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(sec) = {} ]\n'.
        format(int(endTime - startTime)))
async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_src_file & csv_dest_file(to be written)
        NOTE: 
            * Semaphore limit is: 1000
            * While writing the response to csv_dest_file, it is done in chunks of `N` entries at a time
    """

    tasks = []
    sem = asyncio.Semaphore(1000)
    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            line_count = 0
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] +
                        row["WeightedContent"]) + text_actions.getUrlString(
                            row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(
                            row["Content"])
                    entry = [
                        row["ID"],
                        row["SourceSite"],
                        row["ProcessingDate"],
                        row["ProcessingEpoch"],
                        row["CreationDate"],
                        row["Title"],
                        row["Url"],
                        row["SourceTags"],
                        row["ModelTags"],
                        row["NumUpvotes"],
                        row["NumComments"],
                        row["PopI"],
                        row["Content"],
                        row["WeightedContent"],
                    ]
                    csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(
                        semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

        responses = await asyncio.gather(*tasks)

        pc.printMsg(
            "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
            .format(len(responses)))

        for row in responses:
            if row["Content"] or row["Title"]:
                if len(row["Content"]) == 0:  # url was hit but no usable content was extracted; fall back to Title
                    row["Content"] = row["Title"]
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    row["Content"],
                    row["WeightedContent"],
                ]
                await write_result(csv_out, entry)
                # csv_functions.putToCsv(csv_out, entry)
                global WRITTEN_ENTRIES_ASYNC_SCRAPED
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printMsg(
                    " \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                    .format(row["ID"], row["SourceSite"]))
            else:
                pc.printErr(
                    "\t\t xxxxxxxxxxxxxxxxxxx Skipping  for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                    .format(row["ID"], row["SourceSite"]))
async def asyncFetchAll(
        conn, ts,
        series_count):  #series_count : {1,gw.ASYNC_SERIES_CONNECTION}
    """
        Just adds the content into the Content column; no cleaning, WeightedContent, or UrlString handling happens here.
    """

    # wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    # conn = sqlite3.connect(wc_db)
    # gw.SQL_CONN_OPEN += 1
    c = conn.cursor()
    q = "select * from " + wc_table + " where length(Content) = 0"  # only get the rows without content
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    # conn.close()
    # gw.SQL_CONN_OPEN -= 1
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_ASYNC: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()

    socket.gethostbyname("")
    connector = TCPConnector(limit=gw.CONNECTION_COUNT,
                             family=socket.AF_INET,
                             verify_ssl=False)
    pc.printMsg(
        "\n\n===================================================================== Doing {}-th Async Scraping in the same table =====================================================================\n\n"
        .format(series_count))
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:

        tasks = []
        sem = asyncio.Semaphore(gw.SEMAPHORE_COUNT)
        for row in rows:
            """
                ============= row is an array with indices: 
                ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
                SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
            """
            t1 = time.time()
            if (row[5] and row[6]):  # else ignore the entry
                gw.CS_BOYS_STILL_PLAYING += 1
                if gw.CS_BOYS_STILL_PLAYING % gw.CS_BOYS_PLAYING_LIMIT == 0:
                    pc.printMsg(
                        "\t [ASYNC_SCRAPING] sleeping for 1 sec...zzzzzzzzz....... \t BOYS_STILL_PLAYING = {}"
                        .format(gw.CS_BOYS_STILL_PLAYING))
                    time.sleep(1)
                # task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session,series_count))
                task = asyncio.ensure_future(
                    semaphoreSafeFetch(conn, sem, row, session, series_count,
                                       ts))
                tasks.append(task)

        await asyncio.gather(*tasks)
        # responses = await asyncio.gather(*tasks)
        # for row in responses:
        #     if row and len(row[13]) >0:
        #         try:
        #             content = row[13]
        #             conn = sqlite3.connect(wc_db)
        #             c = conn.cursor()
        #             q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
        #             d = (content,row[0],row[1])
        #             c.execute(q,d)
        #             pc.printSucc(" \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE =============== ".format(row[0],row[1]))
        #             conn.commit()
        #             conn.close()
        #         except Exception as e:
        #             logging.error(traceback.format_exc())
        #         pass
        # succs = []
        # for row in responses:
        #     if row and len(row[13]) >0:
        #         succ = asyncio.ensure_future(semaphoreSqlUpdate(sem,row,ts))
        #         succs.append(succ)
        # succsx = await asyncio.gather(*succs)

    endTime = time.time()

    pc.printSucc(
        "\n***************************** {} -th Async Content Scraping is Complete. TABLE: {} ******************"
        .format(series_count, wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row([
        'OUT : TOTAL ITEMS SCRAPED WITH ASYNC YET', '[B] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    pc.printSucc(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Async Scraping (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
async def RunAsync(ts):
    """
        Does ASYNC_SERIES_CONNECTION times number of series executions in parallel
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    """ get rows with content alredy present & put in gw.CS_ITEMS_WRITTEN_DIRECT .Will work just for 1st iteration"""
    c = conn.cursor()
    q = "select count(*) from " + wc_table + " where length(Content) != 0"
    no_scraping_needed_item_count = c.execute(q)
    no_scraping_needed_item_count = c.fetchone()[0]
    gw.CS_ITEMS_WRITTEN_DIRECT = no_scraping_needed_item_count

    conn.commit()
    # conn.close()
    # gw.SQL_CONN_OPEN -= 1

    for i in range(1, gw.ASYNC_SERIES_CONNECTION + 1):
        gw.CS_BOYS_STILL_PLAYING = 0
        pc.printMsg(
            "\n\n..........-------------\/\/\/------\/\/\/------\/\/\/---------------............  Running Async for {} -th time - \t Numer of Async-runs remaining: {} \t\t NOW: {}\n\n"
            .format(i, (gw.ASYNC_SERIES_CONNECTION - i),
                    time.strftime("%H:%M:%S", time.localtime())))
        # asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(asyncFetchAll(ts,i)))
        await asyncFetchAll(conn, ts, i)
        pc.printMsg(
            "\t\t..........-------------\/\/\/------............  {} -th Async Running is done.Sleeping for 10 sec now......ZZZZZZZzzzzzzzzz\t\t NOW: {}\n\n"
            .format(i, time.strftime("%H:%M:%S", time.localtime())))
        time.sleep(10)

    conn.close()
    endTime = time.time()
    pc.printSucc(
        "\n\n***************************** All {} Async Content Scraping is Complete. TABLE: {} ******************"
        .format(gw.ASYNC_SERIES_CONNECTION, wc_table))
    print("\n\n")
    table = PrettyTable([
        'Success (Post ALL series Async Content Scraping)', 'Notation(if any)',
        'Value'
    ])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[A] (A+B+C=X)',
        gw.CS_ITEMS_WRITTEN_DIRECT
    ])
    table.add_row([
        'OUT : ITEMS SCRAPED WITH ASYNC', '[B] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'TIME TAKEN - ASYNC CONTENT SCRAPING (min)', '-',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-ASYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable([
        'Failures (Counted as-in last run of Async Content Scraping)', 'Value'
    ])
    table.add_row(
        ['COUNT. UNREACHABLE URLS in ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED SEMA EXCEP. in ASYNC ',
        gw.CS_ASYNC_SEMA_EXCEPTION_ERR
    ])
    table.add_row([
        'TIME TAKEN - ASYNC CONTENT SCRAPING (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for All {} Async Scraping runs (min) = {} ]\n'
        .format(gw.ASYNC_SERIES_CONNECTION,
                round((endTime - startTime) / 60, 5)))
    print("\n\n")
def ContentFormatting(ts):
    """ 
    Do:
        0. Update Content & WeightedContent column for each row
        1. get url_strings_content = getUrlString(row[13]) -> add it in weighted_content
        2. do clean_text(row[13])
        2. do clean_text(row[12])
        3. clean text clean_text(row[5]) -> add it in weighted_content :: clean_text(row[12]) + " " + clean_title + " " + url_strings_content
        4. if content col is still null; put title into it & in weightedContent too
    """

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .......... Content Formatting Started @Content_Scraper ...........  .  .  .  .  .  .  .  .  .  .  ."
    )

    signal.signal(signal.SIGALRM,
                  timeout_handler)  # timeouts on few function calls, see below
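    # timeout_handler is not defined in this section; it is assumed to be a
    # module-level handler that raises when SIGALRM fires, e.g. something like:
    #     def timeout_handler(signum, frame): raise Exception("function call timed out")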
    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    for row in rows:
        t1 = time.time()
        row_list = list(row)
        if (len(row[13]) != 0):
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1
            clean_title = clean_text(row_list[5])
            if len(row_list[13]) == 0:
                pc.printWarn(
                    "\t\t\t\t --------- No content found on cleaning, using Title as Content :("
                )
                row_list[13] = clean_title
                row_list[12] = clean_title
            else:
                raw_content = row_list[13]
                signal.alarm(200)  # Timeout of 200 sec on function call
                content = clean_title  # if timeout happens, this will be the value of content
                try:
                    content = text_actions.contentfromhtml(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_content = clean_text(content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    weighted_content = text_actions.weightedcontentfromhtml(
                        raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_weighted_content = clean_text(weighted_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                url_string_text = ''  # if timeout happens, this will be the value of content
                try:
                    url_string_text = getUrlString(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! ....using empty str as url_string_text "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(0)  # disarm the pending alarm so it cannot fire outside these guarded calls

                row_list[13] = clean_content
                row_list[
                    12] = clean_weighted_content + " " + url_string_text + " " + clean_title

            row = tuple(row_list)

            pc.printWarn(
                "\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            content = row[13]
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (row[13], row[12], row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-with content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
        else:  #No content
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1
            pc.printMsg(
                "\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            clean_title = clean_text(row_list[5])
            content = clean_title
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (content, content, row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-without content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
    endTime = time.time()

    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Content Formatting is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Content Formatting)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])
    table.add_row([
        'TIME TAKEN - CONTENT FORMATTING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
def RunSync(ts):
    """
        NOTE: PDF pages take a lot of time to scrape. Is it still worth scraping them?
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n"
    )

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']

    q = "select * from " + wc_table + " where length(Content) = 0"
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    pc.printMsg(
        "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n"
        .format(len(rows)))
    conn.commit()
    for row in rows:
        t1 = time.time()
        if (len(row[13]) == 0):
            try:
                if row[6][-4:] not in blob_pages:
                    response = web_requests.hitGetWithRetry(
                        row[6], '', False, 2, 0.5, 30)
                    if response != -1:
                        gw.CS_SYNC_ITEM_SCRAPED += 1
                        res = response.text
                        row_list = list(row)
                        row_list[13] = res
                        row = tuple(row_list)

                        pc.printWarn(
                            "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[1], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                        q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                        d = (row[13], row[0], row[1])
                        c.execute(q, d)
                        conn.commit()
                        # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0],row[1]))
                    else:
                        gw.CS_SYNC_URL_UNREACHABLE += 1
                        pc.printErr(
                            "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {}  \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                else:
                    pc.printMsg(
                        "\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page SYNC. Will use title. URL: {}  \t\t TimeTaken = {} \t NOW: {} "
                        .format(row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
            except Exception as e:
                gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1
                pc.printErr(
                    "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime()),
                            round((time.time() - t1), 5), e))
                # logging.error(traceback.format_exc())
                pass
    endTime = time.time()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
async def fetchWithRetry(row, session):
    """
        Hits the url (with retries):
        * if status == 200:
            returns the response ((raw) Content & (raw) WeightedContent filled into the row)
        * if still unable to hit the url after retries: Content = Title, WeightedContent = Title
        INPUT: `row` is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """

    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')                   #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")

                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
        .format(row[0], row[1]))
    return row
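

# ---------------------------------------------------------------------------
# Hedged sketch (not project code): coroutines like fetchWithRetry above are
# normally driven through a semaphore guard (asyncFetchAll elsewhere in this
# listing references a `semaphoreSafeFetch`). That wrapper is not shown here,
# so the version below is only an illustration; the fetch coroutine is passed
# in explicitly to keep the sketch self-contained.
import asyncio

from aiohttp import ClientSession


async def semaphore_safe_fetch_sketch(sem, row, session, fetch_coro):
    async with sem:                      # cap the number of in-flight requests
        return await fetch_coro(row, session)


async def fetch_all_sketch(rows, fetch_coro, concurrency=5):
    sem = asyncio.Semaphore(concurrency)
    async with ClientSession(headers={'Connection': 'keep-alive'}) as session:
        tasks = [semaphore_safe_fetch_sketch(sem, row, session, fetch_coro)
                 for row in rows]
        return await asyncio.gather(*tasks, return_exceptions=True)

# usage sketch: results = asyncio.get_event_loop().run_until_complete(fetch_all_sketch(rows, fetchWithRetry))
# ---------------------------------------------------------------------------
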
async def fetchWithRetry(conn, row, session, series_count, ts):
    """
        Hits the url (with retries):
        * if status == 200:
            returns the row with the (raw) Content filled in (and writes it into the table)
        * if still unable to hit the url after retries: returns [] so the row can be retried by the sync scraper
        INPUT: `row` is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """

    status = 400
    retry_cnt = 2
    sleep_time = 0.1

    t1 = time.time()
    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=gw.CS_ASYNC_REQ_TIMEOUT) as response:
            # res = await response.content.read()       # returns blob which gives error while ContentFormatter; hence discarded
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                gw.CS_ASYNC_ITEM_SCRAPED += 1
                gw.CS_BOYS_STILL_PLAYING -= 1
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== [ASYNCED SCRAPED#{}] Done ....... \t\t TimeTaken = {} \t NOW: {}"
                    .format(row[0], row[1], series_count,
                            round((round((time.time() - t1), 5)), 5),
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[13] = res
                row = tuple(row_list)

                # wc_db = 'dbs/wc.db'
                wc_table = 'wc_' + str(int(ts))
                # conn = sqlite3.connect(wc_db)
                # gw.SQL_CONN_OPEN += 1
                try:
                    c = conn.cursor()
                    q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                    d = (row[13], row[0], row[1])
                    c.execute(q, d)
                    pc.printWarn(
                        " \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    conn.commit()
                except Exception as e:
                    pc.printMsg(
                        " \t\t === XXXX ====== <ID= {} ><{}> [ASYNC ContentScraped] \t ERRR in INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    logging.error(traceback.format_exc())
                    pass
                # conn.close()
                # gw.SQL_CONN_OPEN -= 1

                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)
    if series_count == gw.ASYNC_SERIES_CONNECTION:
        gw.CS_ASYNC_URL_UNREACHABLE += 1
        pc.printErr(
            "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... Will try sync later: {} \t\t TimeTaken = {} \t NOW: {}"
            .format(row[0], row[1], row[6], round((time.time() - t1), 5),
                    time.strftime("%H:%M:%S", time.localtime())))
    # return row
    return []
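

# ---------------------------------------------------------------------------
# Hedged sketch (not project code): the fetchWithRetry variant above expects
# an already-open sqlite connection and commits per scraped row. Since all the
# coroutines run on the event-loop thread, one shared connection is enough;
# the helper below is an assumption about how the caller prepares it.
import sqlite3


def open_shared_wc_connection_sketch(ts, wc_db='dbs/wc.db'):
    """Open one connection to be shared by all async scraping tasks."""
    conn = sqlite3.connect(wc_db, timeout=10)   # timeout softens 'database is locked' errors
    wc_table = 'wc_' + str(int(ts))
    return conn, wc_table

# usage sketch:
#   conn, wc_table = open_shared_wc_connection_sketch(ts)
#   row = await fetchWithRetry(conn, row, session, series_count, ts)
#   conn.close()
# ---------------------------------------------------------------------------
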
Exemple #23
0
def RunSync(ts):
    """
        Picks the wc-db table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each url.
        * NOTE:
            * If content is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts),
                'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '_wc_sync.csv'
    index = 1
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)

    f = csv.writer(open(csv_dest_file, "w"))  # Flush the old file
    f.writerow([
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ])
    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is
            if (len(row["Content"]) != 0):
                pc.printWarn(
                    "\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    text_actions.clean_text(row["Title"] +
                                            row["WeightedContent"]) +
                    text_actions.getUrlString(
                        row["Content"]),  #add the url-words too
                    text_actions.clean_text(row["Content"]) +
                    text_actions.getUrlString(row["Content"])
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                f = csv.writer(open(csv_dest_file, "a"))
                f.writerow(entry)
            #CHECK2(pre scraping): if(url == NULL)=>discard
            #CHECK3(pre scraping): if (row["title"]==NULL)=>discard
            elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)):
                pc.printWarn(
                    "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    # response = web_requests.hitGetWithRetry(url,TIMEOUT=10)
                    response = web_requests.hitGetWithRetry(
                        row["Url"], '', False, 2, 0.5, 60)
                    # if response.status_code == 200:
                    if response != -1:
                        # content = text_actions.contentfromhtml(response)  #NOTE: for sync
                        content = text_actions.contentfromhtml(
                            response.text)  #NOTE: for Async
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  #add the url-words too
                        # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings #add the url-words too      #NOTE: for sync
                        weightedcontent = text_actions.weightedcontentfromhtml(
                            response.text
                        ) + row[
                            "Title"] + urlstrings  #add the url-words too        #NOTE: for async
                        line_count += 1
                        #CHECK1(post scraping): if (content == null) && (row["Title"] != null) <already checked above> => use the title for both Content & WeightedContent
                        if (len(content) == 0):
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        # build the entry for both cases so the row just processed is always the one written
                        entry = [
                            row["ID"], row["SourceSite"],
                            row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"],
                            row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"],
                            row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content)
                        ]

                        f = csv.writer(open(csv_dest_file, "a"))
                        f.writerow(entry)
                        pc.printMsg(
                            "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                            .format(
                                row["ID"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr(
                            "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , "
                            .format(row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr(
                        "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                        .format(row["ID"],
                                time.strftime("%H:%M:%S", time.localtime()),
                                e))
                    pass
    pc.printMsg(
        "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
    pc.printMsg(
        "|\tWRITTEN_ENTRIES_SYNC \t  | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC          \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC           \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
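

# ---------------------------------------------------------------------------
# Hedged sketch (not project code): text_actions.getUrlString is used above to
# append "url words" to the content, but its body is not in this listing. A
# plausible stand-in is a regex pass that keeps the word-like pieces of any
# embedded links; treat the behaviour below as an assumption.
import re


def get_url_string_sketch(text):
    url_pattern = re.compile(r'https?://[^\s"\'<>]+')
    tokens = []
    for url in url_pattern.findall(text or ''):
        # keep host and path segments longer than two characters
        tokens.extend(w for w in re.split(r'[^A-Za-z0-9]+', url) if len(w) > 2)
    return ' '.join(tokens)

# usage sketch:
#   get_url_string_sketch('see https://example.com/deep-learning-post')
#   -> 'https example com deep learning post'
# ---------------------------------------------------------------------------
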
Exemple #24
0
def run(ts):
    """
        Gets the top submissions of the listed subreddits (the API max limit is 1000, which should be enough).
        `ts` is only used here to name/locate this week's wc table.
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'.
        format(datetime.fromtimestamp(ts), wc_table))
    pc.printMsg(
        "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()

    blob_pages = ['.jpg', '.png', '.gif', '.mp3',
                  '.mp4']  # these give blob data; no point in scraping them

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    # Setup Client
    reddit = praw.Reddit(
        client_id=vault.R_CLIENT_ID,  # PERSONAL_USE_SCRIPT_14_CHARS
        client_secret=vault.R_CLIENT_SECRET,  # SECRET_KEY_27_CHARS
        user_agent=vault.R_USER_AGENT,  # YOUR_APP_NAME
        username=vault.R_USERNAME,  # YOUR_REDDIT_USER_NAME
        password=vault.R_PASSWORD)  # YOUR_REDDIT_LOGIN_PASSWORD

    for subreddit, tag_arr in LIST.items():
        try:
            pc.printWarn(
                "\t ............  Subreddit@R_UrlScraping : {}  .............".
                format(subreddit))
            sr = reddit.subreddit(subreddit)
            # for submission in sr.top('day',limit=10):                   # For testing....
            # for submission in sr.top('year',limit=1000):                #remove this & uncomemnt below line
            ENTRIES_IN_THIS_SUBREDDIT = 0
            for submission in sr.top('week',
                                     limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT
                                     ):  #NOTE: max limit is 1000
                #Check1: if the post is unlocked by mods
                content = ''
                """ Fixing permalink type urls """
                url = submission.url
                if (url[:2] == '/r'):
                    url = "https://www.reddit.com" + url
                if (submission.locked == False):
                    #Check2: if post is just an image, discard it
                    if submission.url[
                            -4:] not in blob_pages:  # as reddit currently hosts .png & .gif only
                        # if permalink is a substring of url OR submission is a selfpost (text-only) => no need to scrape
                        # NOTE: I know there might be links in a post with some description plus a link to another article being referred to; but not worth wasting precious processing time
                        if ((submission.permalink in submission.url)
                                or (submission.is_self == True)):
                            content = submission.selftext
                        entry = [
                            index, "r/" + subreddit,
                            datetime.fromtimestamp(ts).date(),
                            int(ts),
                            date_conversion.RedditDate(
                                str(datetime.fromtimestamp(
                                    submission.created))), submission.title,
                            url,
                            json.dumps(tag_arr), '', submission.score,
                            submission.num_comments, '', '',
                            text_actions.clean_text(content)
                        ]
                        # csv_functions.putToCsv(csv_file,entry)
                        c.execute(
                            'INSERT INTO ' + wc_table +
                            ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                        index += 1
                        ENTRIES_IN_THIS_SUBREDDIT += 1
            gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBREDDIT
            pc.printMsg(
                "\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBREDDIT = {} \t\t |  \t gw.R_TOTAL_ITEMS_GOT_YET = {}"
                .format(ENTRIES_IN_THIS_SUBREDDIT, gw.R_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET

    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n"
    )
    pc.printSucc(
        "\n\n***************************** Reddit Url Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post r URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by r(Reddit)', gw.R_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-r (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)
    print("\n\n")
Exemple #25
0
def run(ts):
    """ I. Creates wc_table(in wc.db) & wp_table(in wp.dp) for the week
        II. Runs following scrapers serially and updates them in WC-DB:
            1. hn_scraper.py
            2. r_scraper.py
            4. ph_scraper.py => Api exists, Scraping not allowed(doint it anyway)
            3. ih_scraper.py => No Api, Scraping not allowed(postponed for later)

        Input: float(timestamp) - set when the main.py run is triggered
            * float because o/w `datetime.fromtimestamp(ts)` wont run on int
        Outpu: None, just put data in WC-DB
    """
    startTime = time.time()
    """ Initialize the weekly content tables in wc.db and wp.db"""

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(wc_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(wc_table))
    else:  # creating a new table
        c.execute(
            "CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, SourceTags,ModelTags,NumUpvotes, NumComments, PopI,WeightedContent,Content)"
            .format(wc_table))

    pc.printSucc(
        "\n**************************************************** wc_table created => {} **************************************************** \n"
        .format(wc_table))

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(wp_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(wp_table))
    else:  # creating a new table
        c.execute('''CREATE TABLE {}
                (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, ThumbnailUrl,SourceTags,NumUpvotes, NumComments, PopI,Content)'''
                  .format(wp_table))

    pc.printSucc(
        "\n**************************************************** wp_table created => {} **************************************************** \n"
        .format(wp_table))
    """ Run the scrapers sequentially """
    pc.printWarn(
        ".   .   .   .   .   .   .   .   .   .   .   .   .   .   .   ...... Started Running all the scrapers ......    .   .   .   .   .   .   .   .   .   .   .   .   .   .   .\n"
    )

    try:
        hn_scraper.run(ts)
        pc.printSucc(
            "\n================ HH url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-HN xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        r_scraper.run(ts)
        pc.printSucc(
            " \n================ Reddit url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-Reddit xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    try:
        ph_scraper.run(ts)
        pc.printSucc(
            " \n================ PH url scraper run: Complete ================\n"
        )
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-PH xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass

    # try:
    #     ih_scraper.run(ts)
    #     print(" \n====== IH url scraper run: Complete ======\n")
    # except Exception as e:
    #     print(" XXXXXXXXXXXX Error in scraping IH for url XXXXXXXXXXXXXXXXX \n \t\tError = {}".format(str(e)))
    #     pass

    #TODO: add Lobsters here

    endTime = time.time()
    pc.printSucc(
        " ********************************************** URL Scraping(HN,r,PH) is complete *******************************************\n"
    )
    print("\n\n")
    table = PrettyTable(['Entity (Post all URL Scraping)', 'Value'])
    table.add_row(['TOTAL URL ITEMS IN WC TABLE ', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-All (min) ',
        round((endTime - startTime) / 60, 2)
    ])
    pc.printSucc(table)
    print("\n\n")
Exemple #26
0
def run(ts):
    """
        This function does:
            * Creates the Tree Schema(germination)
            * Update Nodes(leaves & accumulated) with item_count(count) & avg_popi in schema iteself
            * Creates & updates th_table for given timestamp(ts)
    """
    """ create the tree """
    startTime = time.time()
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Tree Germination in progress .......    .   .   .   .   .   .   .   .   .\n"
    )
    root = TreeGermination()
    pc.printSucc(
        "\t\t <----------------------------------------------- Tree is Germinated ------------------------------------------------>\n"
    )
    """ update leafnodes """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating Leaf(tag) Nodes.......    .   .   .   .   .   .   .   .   .\n"
    )
    updateLeafNodes(ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- Leaf Nodes updated ------------------------------------------------>\n"
    )
    """ update parents """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating Parent Nodes.......    .   .   .   .   .   .   .   .   .\n"
    )
    updateParentNodes(root)
    pc.printSucc(
        "\t\t <--------------------------------------------- Parent Nodes updated ------------------------------------------------>\n"
    )
    """ NOTE: Print the Tree if you want """
    tree_printer_pretty.print_tree(root)
    """ Create & Populate Tag Hotness(TH) Table"""
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Creating & Populating TH Table .......    .   .   .   .   .   .   .   .   .\n"
    )
    create_th(ts)
    update_th_mptt(root, 1, 1, ts)  # update_th_mptt(root,left,level,ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- TH Table Created & Populated ------------------------------------------------>\n"
    )
    """ Update th_table for ItemIDs of wc_table """
    pc.printWarn(
        "\t\t .   .   .   .   .   .   .   .   .   ....... Updating th_table for ItemIDs from wc_table.......    .   .   .   .   .   .   .   .   .\n"
    )
    update_th_table_for_itemIDs(root, ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- th_table now has ItemIDs(HN_IDs,R_IDs) from wc_table ------------------------------------------------>\n"
    )

    endTime = time.time()
    th_table = 'th_' + str(int(ts))
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR th_creating & th_updating@th (sec)   =>  {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), th_table))
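

# ---------------------------------------------------------------------------
# Hedged sketch (not project code): the call update_th_mptt(root, 1, 1, ts)
# above suggests the th table stores an MPTT / nested-set numbering of the tag
# tree. The project's node class and table writes are elsewhere, so the
# stand-alone version below only shows how left/right/level values get
# assigned while walking the tree.
class NodeSketch:
    def __init__(self, name, children=None):
        self.name = name
        self.children = children or []
        self.left = self.right = self.level = None


def assign_mptt_sketch(node, left=1, level=1):
    """Set left/right/level on the subtree; return this node's right value."""
    node.left, node.level = left, level
    right = left + 1
    for child in node.children:
        right = assign_mptt_sketch(child, right, level + 1) + 1
    node.right = right
    return right

# usage sketch:
#   root = NodeSketch('root', [NodeSketch('ml'), NodeSketch('web', [NodeSketch('css')])])
#   assign_mptt_sketch(root)   # root -> (1, 8), ml -> (2, 3), web -> (4, 7), css -> (5, 6)
# ---------------------------------------------------------------------------
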
def cleanNcheckAsyncOutput(csv_in, csv_out):
    """
        Analyses the input file & the created output file.
        Also cleans Content & WeightedContent -> puts them into a new file and deletes the old one.
        Variables:
            * NO_LINES_IN_INPUT_CSV
            * NO_LINES_IN_OUTPUT_CSV
            * NO_LINES_IN_OUTPUT_WITHOUT_TITLE
            * NO_LINES_IN_OUTPUT_WITHOUT_URL
            * NO_LINES_IN_OUTPUT_WITHOUT_CONTENT
    """

    with open(csv_in, "r") as f:
        reader = csv.reader(f)
        NO_LINES_IN_INPUT_CSV = len(list(reader))  # header + data rows of the input file
    """ Now check and create new "cleaned" file """

    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_final_out = os.path.join("F", csv_out)
    csv_functions.creteCsvFile(csv_final_out, headers)

    pc.prCyan(
        " ========================== NOW CREATING FINAL OUTPUT FILE: {} ==========================="
        .format(csv_final_out))

    line_count = 0
    with open(csv_out, mode='r') as r, open(csv_final_out, 'a+',
                                            newline='') as f:
        reader = csv.DictReader(r)
        writer = csv.writer(f)
        NO_LINES_IN_OUTPUT_CSV = 0
        NO_LINES_IN_OUTPUT_WITHOUT_TITLE = 0
        NO_LINES_IN_OUTPUT_WITHOUT_URL = 0
        NO_LINES_IN_OUTPUT_WITHOUT_CONTENT = 0
        for row in reader:
            # NOTE: DictReader has already consumed the header line, so every row here is data
            line_count += 1
            url_string_content = text_actions.getUrlString(row["Content"])
            content = text_actions.clean_text(row["Content"])
            weighted_content = text_actions.clean_text(row["WeightedContent"])
            entry = [
                row["ID"],
                row["SourceSite"],
                row["ProcessingDate"],
                row["ProcessingEpoch"],
                row["CreationDate"],
                row["Title"],
                row["Url"],
                row["SourceTags"],
                row["ModelTags"],
                row["NumUpvotes"],
                row["NumComments"],
                row["PopI"],
                weighted_content + url_string_content,
                content,
            ]
            writer.writerow(entry)
            NO_LINES_IN_OUTPUT_CSV += 1
            if (len(row["Title"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_TITLE += 1
            if (len(row["Url"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_URL += 1
            if (len(row["Content"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_CONTENT += 1

    #TODO:  os.remove(csv_in) %% rename

    pc.printWarn(
        "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    )
    pc.printWarn(
        "|\t NO_LINES_IN_INPUT_CSV                 \t | \t  {}  \t|".format(
            NO_LINES_IN_INPUT_CSV))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_CSV                \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_CSV))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_TITLE      \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_TITLE))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_URL        \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_URL))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_CONTENT    \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_CONTENT))
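

# ---------------------------------------------------------------------------
# Hedged sketch (not project code): cleanNcheckAsyncOutput above counts the
# input rows with len(list(reader)), which loads the whole file into memory.
# For the large scrape files this pipeline produces, a streaming count is the
# lighter equivalent; shown here only as an illustration.
import csv


def count_csv_rows_sketch(path, skip_header=True):
    with open(path, newline='') as f:
        count = sum(1 for _ in csv.reader(f))
    return count - 1 if (skip_header and count) else count

# usage sketch: NO_LINES_IN_INPUT_CSV = count_csv_rows_sketch(csv_in, skip_header=False)
# ---------------------------------------------------------------------------
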
async def asyncFetchAll(ts):
    """
        INPUT: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT

    tasks = []
    sem = asyncio.Semaphore(SEMAPHORE_COUNT)

    #==========================init connection
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Opened > ---------------------------------------------\n"
    )
    stratTime = time.time()

    # """ Initialize the output file """
    # headers = ['ID', 'SourceSite', 'ProcessingDate','ProcessingEpoch','CreationDate', 'Title', 'Url', 'SourceTags','ModelTags','NumUpvotes', 'NumComments', 'PopI','WeightedContent','Content']
    # csv_functions.creteCsvFile(csv_out,headers)

    global ENTRIES_TO_BE_WRITTEN
    global WRITTEN_ENTRIES_ASYNC_SCRAPED
    global WRITTEN_ENTRIES_ASYNC_DIRECT
    global ASYNC_ENTRIES_TO_BE_SCRAPED

    connector = TCPConnector(limit=CONNTECTION_COUNT,
                             family=socket.AF_INET,
                             verify_ssl=False)
    # connector = TCPConnector(limit=CONNTECTION_COUNT)
    # connector = ProxyConnector.from_url('http://*****:*****@127.0.0.1:1080')
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        q = "select * from " + wc_table
        rows_head = c.execute(q)
        rows = rows_head.fetchall()
        for row in rows:
            """
                ============= row is an array with indices: 
                ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
                SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
            """
            ENTRIES_TO_BE_WRITTEN += 1
            if (len(row[13]) != 0):
                pc.printWarn(
                    "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                clean_content = row[13]  #Already cleaned in url_scraper
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content

                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Direct] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))
            elif (row[5] and row[6]):  # else ignore the entry
                ASYNC_ENTRIES_TO_BE_SCRAPED += 1
                print("\t\t\t\t\t SENT...... SENT_COUNT = {}".format(
                    ASYNC_ENTRIES_TO_BE_SCRAPED))
                # if(ASYNC_ENTRIES_TO_BE_SCRAPED%100 == 0):
                #     pc.printMsg("\t\t\t.......................zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz <NAP TIME> for 5 sec After 100 async-requests while content scraping #ZarooriHaiJi zzzzzzzzzzzzzzz.......................")
                #     time.sleep(5)
                task = asyncio.ensure_future(
                    semaphoreSafeFetch(sem, row, session))
                tasks.append(task)

        responses = await asyncio.gather(*tasks)
        for row in responses:
            if row:
                clean_content = clean_text(row[13])
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content
                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Scraped] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Closed > ---------------------------------------------\n"
    )