async def semaphoreSafeFetch(sem, row, session, csv_out):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    async with sem:
        try:
            return await fetchWithRetry(row, session, csv_out)
        except Exception as e:
            global FAILED_ASYNC
            FAILED_ASYNC += 1
            # This error is mainly because of:
            ## 1. [nodename nor servname provided, or not known]
            ## 2. [Too many open files]
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Scraping failed. Using Title for Content.... \n \t\t ERROR {}"
                .format(row["ID"], row["SourceSite"],
                        time.strftime("%H:%M:%S", time.localtime()), e))
            if len(row["Content"]) == 0:
                row["WeightedContent"] = row["Title"]
                row["Content"] = row["Title"]
            await write_result(csv_out, row)
            global WRITTEN_ENTRIES_ASYNC_TRIED_ERR
            WRITTEN_ENTRIES_ASYNC_TRIED_ERR += 1
            pc.printMsg(
                " \t\t\t============== [Tried Catch] Done Writing into csv for <ID = {}><src= {} > =============== "
                .format(row["ID"], row["SourceSite"]))
            return row
def run(ts):
    startTime = time.time()
    try:
        run_wc(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    try:
        run_wp(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wp table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    endTime = time.time()
    pc.printSucc("**************************** PopI Calculation is Done for wc & wp ********************************\n\n")
    pc.printWarn("| \t\t TIME TAKEN FOR PopICalculators-both \t\t | \t\t {} \t\t |".format(round((endTime - startTime), 5)))
    pc.printSucc("*************************************************************************************************\n\n")
    pc.printSucc("\n\n***************************** PopI Calculation is Complete.************************")
    print("\n\n")
    table = PrettyTable(['Entity (Post PopI Calculation)', 'Value'])
    table.add_row(['TIME TAKEN FOR PopICalculators(wc & wp) (min)', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
def print_tree_horizontally(current_node, balanced_branches, name_getter, indent='', last='updown'):
    up, down = balanced_branches(current_node)
    item_len = len(current_node.name) + len(str(current_node.popi)) + len(str(current_node.count)) - 10
    if not current_node.isTag:
        item_len += 2

    """ Printing of "up" branch. """
    for child in up:
        next_last = 'up' if up.index(child) == 0 else ''
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', ' ' * (len(current_node.name)))
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', ' ' * (item_len))
        next_indent = '{0}{1}{2}'.format(indent, ' ' * (item_len) if 'up' in last else '│', ' ' * (item_len))
        print_tree_horizontally(child, balanced_branches, name_getter, next_indent, next_last)

    """ Printing of current node. """
    if last == 'up':
        start_shape = '┌'
    elif last == 'down':
        start_shape = '└'
    elif last == 'updown':
        start_shape = ' '
    else:
        start_shape = '├'

    if up:
        end_shape = '┤'
    elif down:
        end_shape = '┐'
    else:
        end_shape = ''

    # print('{0}{1}{2}{3}'.format(indent, start_shape, name_getter(current_node), end_shape))
    if current_node.isTag:
        pc.printMsg('{0}{1}<{2}>(c: {3},p: {4}){5}'.format(
            indent, start_shape, current_node.name, current_node.count, current_node.popi, end_shape))
    else:
        pc.printErr('{0}{1}[<{2}>](c: {3},p: {4}){5}'.format(
            indent, start_shape, current_node.name, current_node.count, current_node.popi, end_shape))

    """ Printing of "down" branch. """
    for child in down:
        next_last = 'down' if down.index(child) == len(down) - 1 else ''
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', ' ' * (len(current_node.name)))
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', ' ' * (item_len))
        next_indent = '{0}{1}{2}'.format(indent, ' ' * (item_len) if 'down' in last else '│', ' ' * (item_len))
        print_tree_horizontally(child, balanced_branches, name_getter, next_indent, next_last)
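# NOTE: illustrative sketch (assumption, not part of the repo): `balanced_branches` is passed in
# by the caller and is expected to split a node's children into the half printed above ("up") and
# the half printed below ("down") the node itself. A hypothetical splitter with that contract,
# assuming the tag-tree node exposes a `children` list, could look like this:
def _balanced_branches_example(node):
    kids = list(node.children)           # hypothetical attribute; the real node API may differ
    half = len(kids) // 2
    return kids[:half], kids[half:]      # (up, down)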
async def semaphoreSafeFetch(sem, row, session):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    async with sem:
        try:
            return await fetchWithRetry(row, session)
        except Exception as e:
            global FAILED_ASYNC
            FAILED_ASYNC += 1
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                .format(row["ID"], row["SourceSite"],
                        time.strftime("%H:%M:%S", time.localtime()), e))
            return row
def return_all_descendents(ts, root):
    """ Returns all the descendents of the node in the tag-tree where node.NodeName = root """
    pc.printMsg(
        " \t\t???????????????????????????????????? Query for All Descendents of NodeName = {}"
        .format(root))
    descendents = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n"
    )
    q = 'select LeftMptt, RightMptt from ' + th_table + ' where NodeName = ?'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as descendents"
            .format(root, th_table))
        conn.close()
        return descendents
    pc.printMsg(" root.LeftMptt = {} , root.RightMptt = {} \n".format(
        root_mptt_values[0], root_mptt_values[1]))
    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ?'
    d = (root_mptt_values[0], root_mptt_values[1])
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t * DESCENDENT of {} :: {}".format(root, row))
        descendents.append(row)
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n"
    )
    return descendents
def return_imm_children(ts, root):
    """
        INPUT: ts, root.name (string)
        OUTPUT: Returns the full row (ID, NodeName, LeftMptt, RightMptt, DepthLevel, ItemCount, AvgPopI, HN_IDs, R_IDs)
                of just the immediate children of the node in the tag-tree where node.NodeName = root
    """
    # pc.printMsg(" \t\t ???????????????????????????????????? Query for Immediate Children of NodeName = {}".format(root))
    children = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n")
    q = 'select LeftMptt, RightMptt, DepthLevel from ' + th_table + ' where NodeName = ? ;'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as children"
            .format(root, th_table))
        conn.close()
        return children
    pc.printMsg(
        " \t ROOT: {} \troot.LeftMptt = {} , root.RightMptt = {} , root.DepthLevel = {}\n"
        .format(root, root_mptt_values[0], root_mptt_values[1], root_mptt_values[2]))
    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ? And DepthLevel = ? '
    d = (root_mptt_values[0], root_mptt_values[1], root_mptt_values[2] + 1)
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t * CHILD of {} :: {}".format(root, row[1]))
        children.append(row)
    conn.commit()
    conn.close()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n")
    return children
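# NOTE: illustrative sketch (not used by the pipeline). It demonstrates the nested-set (MPTT)
# invariant the two queries above rely on: every descendant's LeftMptt/RightMptt pair nests
# strictly inside its ancestor's pair, and immediate children additionally sit exactly one
# DepthLevel below the root. Table/column names mirror the th_<ts> schema; the data is made up.
def _mptt_descendants_example():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute("CREATE TABLE th_demo (ID, NodeName, LeftMptt, RightMptt, DepthLevel)")
    # tree:  root(1,8) -> a(2,5) -> b(3,4)
    #                  -> c(6,7)
    c.executemany("INSERT INTO th_demo VALUES (?,?,?,?,?)", [
        (1, 'root', 1, 8, 0),
        (2, 'a',    2, 5, 1),
        (3, 'b',    3, 4, 2),
        (4, 'c',    6, 7, 1),
    ])
    # all descendants of 'root': LeftMptt/RightMptt strictly inside (1, 8)
    c.execute("select NodeName from th_demo where LeftMptt > ? and RightMptt < ?", (1, 8))
    print(c.fetchall())   # [('a',), ('b',), ('c',)]
    # immediate children only: same range check plus DepthLevel = root.DepthLevel + 1
    c.execute("select NodeName from th_demo where LeftMptt > ? and RightMptt < ? and DepthLevel = ?", (1, 8, 1))
    print(c.fetchall())   # [('a',), ('c',)]
    conn.close()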
async def fetchWithRetry(row, session):
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60
    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
def run(ts):
    startTime = time.time()
    try:
        update_modelTags(ts)
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Tagger Simulator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass
    endTime = time.time()
    pc.printSucc(
        "**************************** Tagger(Simulator) Run is Complete for wc **********************************************"
    )
    pc.printWarn(
        "| \t\t TIME TAKEN FOR Tagger(Simulator) Run(sec) \t\t | \t\t {} \t\t |"
        .format(round((endTime - startTime), 5)))
    pc.printSucc(
        "***********************************************************************************************************************\n\n"
    )
def run(ts): """ Scrapes PH api for last 7 days & puts data in WP-DB. * Api supports daywaise only. So scrape for one day at a time * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category) * NOTE: * No threshold set on upvotes or comments rn.Maybe later? * API-Ratelimit: You can make up to 900 requests every 15 minutes, else gives `status 429` in response.If that happens, wait for 16 mins, then hit again. * Retry 2 times; if failed nonetheless, skip! * Content = Tagline * URL: is the PH url only. Going to the product page & then finding the actual link is overkill * (this could also help later on getting their permission while monetizing) * Used self-retry logic. but check this package:: Read about requests.retries here: [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure), [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1) Input: ts (format: 1598692058.887741) * ============= row is an array with indices: (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12)) """ wp_db = 'dbs/wp.db' wp_table = 'wp_' + str(int(ts)) pc.printSucc( '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'. format(datetime.fromtimestamp(ts), wp_table)) conn = sqlite3.connect(wp_db, timeout=10) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n" ) startTime = time.time() """ here is how you add day to `ts`: from datetime import datetime, timedelta newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094 newts.timestamp() # 1598783633.284871 datetime.fromtimestamp(ts) #2020-08-29 17:15:32 # get date from it: datetime.fromtimestamp(ts).date() #2020-08-29 """ """ days_arr has last 7 days(including today's) (YYYY-MM-DD)date strings ; just the way PH's API needs """ curr_date = str(int(ts)) days_arr = [str(datetime.fromtimestamp(int(ts)).date())] # '2020-08-29' for i in range(6): new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1) new_ts = new_ts.timestamp() curr_date = new_ts days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date())) PH_REQ_HEADERS = { "Accept": "application/json", "Content-Type": "application/json", "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN, "Host": "api.producthunt.com" } # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv' index = gw.WP_TOTAL_ENTRIES_YET + 1 for date in days_arr: pc.printMsg( " ................. scraping for date = {} .................\n". 
format(date)) url = 'https://api.producthunt.com/v1/posts?day=' + date try: data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2, 5, 10) if (data == -1): pc.printErr( "\t\txxxxxx Unable to hit {} after 2 retries.Skipping this date( {} ) xxxxxx\n" .format(url, date)) else: items_arr = json.loads(data.content)["posts"] for item in items_arr: # print(json.dumps(item, indent = 4)) """ get all the tags attached along with the item """ source_tags = [] for tag in item["topics"]: source_tags.append(tag["name"]) entry = [ index, "PH", datetime.fromtimestamp(ts).date(), int(ts), date_conversion.PHDate(str(item["created_at"])), item["name"], item["discussion_url"], item["thumbnail"]["image_url"], json.dumps(source_tags), item["votes_count"], item["comments_count"], '', item["tagline"] ] # csv_functions.putToCsv(csv_file,entry) c.execute( 'INSERT INTO ' + wp_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index = index + 1 gw.PH_TOTAL_ITEMS_GOT_YET += 1 except Exception as e: pc.printErr( " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n" .format(index, e)) logging.error(traceback.format_exc()) pass pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format( gw.PH_TOTAL_ITEMS_GOT_YET)) gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET endTime = time.time() conn.commit() conn.close() pc.printMsg( "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************" .format(wp_table)) print("\n\n") table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-PH (sec) ', round((endTime - startTime), 5) ]) pc.printSucc(table) print("\n\n")
def run(ts): """ I. Creates wc_table(in wc.db) & wp_table(in wp.dp) for the week II. Runs following scrapers serially and updates them in WC-DB: 1. hn_scraper.py 2. r_scraper.py 4. ph_scraper.py => Api exists, Scraping not allowed(doint it anyway) 3. ih_scraper.py => No Api, Scraping not allowed(postponed for later) Input: float(timestamp) - set when the main.py run is triggered * float because o/w `datetime.fromtimestamp(ts)` wont run on int Outpu: None, just put data in WC-DB """ startTime = time.time() """ Initialize the weekly content tables in wc.db and wp.db""" wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() c.execute( "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'" .format(wc_table)) if c.fetchone()[0] == 1: # table exists, flush away! c.execute("delete from {}".format(wc_table)) else: # creting new table c.execute( "CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, SourceTags,ModelTags,NumUpvotes, NumComments, PopI,WeightedContent,Content)" .format(wc_table)) pc.printSucc( "\n**************************************************** wc_table created => {} **************************************************** \n" .format(wc_table)) wp_db = 'dbs/wp.db' wp_table = 'wp_' + str(int(ts)) conn = sqlite3.connect(wp_db, timeout=10) c = conn.cursor() c.execute( "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'" .format(wp_table)) if c.fetchone()[0] == 1: # table exists, flush away! c.execute("delete from {}".format(wc_table)) else: # creting new table c.execute('''CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, ThumbnailUrl,SourceTags,NumUpvotes, NumComments, PopI,Content)''' .format(wp_table)) pc.printSucc( "\n**************************************************** wp_table created => {} **************************************************** \n" .format(wp_table)) """ Run the scrapers sequentially """ pc.printWarn( ". . . . . . . . . . . . . . . ...... Started Running all the scrapers ...... . . . . . . . . . . . . . . 
.\n" ) try: hn_scraper.run(ts) pc.printSucc( "\n================ HH url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-HN xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}" .format(str(e))) logging.error(traceback.format_exc()) pass try: r_scraper.run(ts) pc.printSucc( " \n================ Reddit url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-Reddit xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}" .format(str(e))) logging.error(traceback.format_exc()) pass try: ph_scraper.run(ts) pc.printSucc( " \n================ PH url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-PH xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}" .format(str(e))) logging.error(traceback.format_exc()) pass # try: # ih_scraper.run(ts) # print(" \n====== IH url scraper run: Complete ======\n") # except Exception as e: # print(" XXXXXXXXXXXX Error in scraping IH for url XXXXXXXXXXXXXXXXX \n \t\tError = {}".format(str(e))) # pass #TODO: add Lobsters here endTime = time.time() pc.printSucc( " ********************************************** URL Scraping(HN,r,PH) is complete *******************************************\n" ) print("\n\n") table = PrettyTable(['Entity (Post all URL Scraping)', 'Value']) table.add_row(['TOTAL URL ITEMS IN WC TABLE ', gw.WC_TOTAL_URL_ENTRIES]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-All (min) ', round((endTime - startTime) / 60, 2) ]) pc.printSucc(table) print("\n\n")
async def fetchWithRetry(conn, row, session, series_count, ts):
    """
        Hits url (with retries):
            * if status == 200: return response ((raw)Content & (raw)WeightedContent in row)
            * if still unable to hit after retries: Content = Title , WeightedContent = Title
        INPUT: `row` is an array with indices:
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 0.1
    t1 = time.time()
    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=gw.CS_ASYNC_REQ_TIMEOUT) as response:
            # res = await response.content.read()  # returns blob which gives error while ContentFormatter; hence discarded
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                gw.CS_ASYNC_ITEM_SCRAPED += 1
                gw.CS_BOYS_STILL_PLAYING -= 1
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== [ASYNCED SCRAPED#{}] Done ....... \t\t TimeTaken = {} \t NOW: {}"
                    .format(row[0], row[1], series_count,
                            round((time.time() - t1), 5),
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[13] = res
                row = tuple(row_list)
                wc_table = 'wc_' + str(int(ts))
                try:
                    c = conn.cursor()
                    q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                    d = (row[13], row[0], row[1])
                    c.execute(q, d)
                    pc.printWarn(
                        " \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    conn.commit()
                except Exception as e:
                    pc.printMsg(
                        " \t\t === XXXX ====== <ID= {} ><{}> [ASYNC ContentScraped] \t ERRR in INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    logging.error(traceback.format_exc())
                    pass
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    if series_count == gw.ASYNC_SERIES_CONNECTION:
        gw.CS_ASYNC_URL_UNREACHABLE += 1
        pc.printErr(
            "\t\txxxxx For <ID = {}><src= {} >Totally unable to hit url.... Will try sync later: {} \t\t TimeTaken = {} \t NOW: {}"
            .format(row[0], row[1], row[6], round((time.time() - t1), 5),
                    time.strftime("%H:%M:%S", time.localtime())))
    return []
def run(ts):
    nest_asyncio.apply()  # to be able to run an async loop from within another async loop
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), gw.SEMAPHORE_COUNT, gw.CONNECTION_COUNT, wc_table))
    startTime = time.time()

    """ scrape content in async """
    asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(RunAsync(ts)))
    time.sleep(10)

    """ scrape remaining items with sync """
    RunSync(ts)

    """ formatting everything in the end - done in sync """
    time.sleep(10)
    ContentFormatting(ts)

    endTime = time.time()
    pc.printSucc(
        "\n\n\n\n\n****************** Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entities (Post Content Scraping-all)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['CS_OUT : ITEMS SCRAPED WITH ASYNC', '[A] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    table.add_row(['CS_OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[B] (A+B+C=X)', gw.CS_ITEMS_WRITTEN_DIRECT])
    table.add_row(['CS_OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED])
    table.add_row(['CF_OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK])
    table.add_row(['CF_OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT])
    pc.printSucc(table)

    pc.printErr(
        "\n\n------------------------------------------ ERRORS (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Content Scraping-all)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS - ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED SEMA EXCEP. - ASYNC ', gw.CS_ASYNC_SEMA_EXCEPTION_ERR])
    table.add_row(['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED EXCEP. - SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(min) = {} ]\n\n\n\n\n\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n\n\n")
def ContentFormatting(ts): """ Do: 0. Update Content & WeightedContent column for each row 1. get url_strings_content = getUrlString(row[13]) -> add it in weighted_content 2. do clean_text(row[13]) 2. do clean_text(row[12]) 3. clean text clean_text(row[5]) -> add it in weighted_content :: clean_text(row[12]) + " " + clean_title + " " + url_strings_content 4. if content col is still null; put title into it & in weightedContent too """ wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n" ) startTime = time.time() pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format( time.strftime("%H:%M:%S", time.localtime()))) pc.printWarn( "\t\t. . . . . . . . . . . .......... Content Formatting Started @Content_Scraper ........... . . . . . . . . . . ." ) signal.signal(signal.SIGALRM, timeout_handler) # timeouts on few function calls, see below q = "select * from " + wc_table rows_head = c.execute(q) rows = rows_head.fetchall() conn.commit() for row in rows: t1 = time.time() row_list = list(row) if (len(row[13]) != 0): gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1 clean_title = clean_text(row_list[5]) if len(row_list[13]) == 0: pc.printWarn( "\t\t\t\t --------- No content found on cleaning, using Title as Content :(" ) row_list[13] = clean_title row_list[12] = clean_title else: raw_content = row_list[13] signal.alarm(200) # Timeout of 200 sec on function call content = clean_title # if timeout happens, this will be the value of content try: content = text_actions.contentfromhtml(raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call clean_content = clean_title # if timeout happens, this will be the value of content try: clean_content = clean_text(content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call weighted_content = clean_title # if timeout happens, this will be the value of content try: weighted_content = text_actions.weightedcontentfromhtml( raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call clean_weighted_content = clean_title # if timeout happens, this will be the value of content try: clean_weighted_content = clean_text(weighted_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call url_string_text = '' # if timeout happens, this will be the value of content try: url_string_text = getUrlString(raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! 
....using empty str as url_string_text " .format(row[0], row[1])) # pc.printWarn(exc) pass row_list[13] = clean_content row_list[ 12] = clean_weighted_content + " " + url_string_text + " " + clean_title row = tuple(row_list) pc.printWarn( "\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}" .format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) content = row[13] q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?' d = (row[13], row[12], row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-with content INSERTED INTO TABLE =============== ".format(row[0],row[1])) else: #No content gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1 pc.printMsg( "\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}" .format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) clean_title = clean_text(row_list[5]) content = clean_title q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?' d = (content, content, row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-without content INSERTED INTO TABLE =============== ".format(row[0],row[1])) endTime = time.time() conn.close() pc.printMsg( "\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Content Formatting is Complete. TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable( ['Success (Post Content Formatting)', 'Notation(if any)', 'Value']) table.add_row([ 'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES ]) table.add_row([ 'OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK ]) table.add_row([ 'OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT ]) table.add_row([ 'TIME TAKEN - CONTENT FORMATTING (min)', '-', round((endTime - startTime) / 60, 5) ]) pc.printSucc(table) print("\n") pc.printWarn( '\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n' .format(round((endTime - startTime), 5) / 60)) print("\n\n")
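# NOTE: illustrative sketch (assumption): ContentFormatting() above registers `timeout_handler`
# for SIGALRM, but the handler itself is not shown in this section. A handler like the one below,
# which simply raises, is enough to make the guarded contentfromhtml/clean_text/getUrlString calls
# fall through to their title/empty-string fallbacks. The real handler may live elsewhere in this
# module and differ; note also that signal.alarm(0) would cancel a pending alarm once a guarded
# call returns in time.
def timeout_handler(signum, frame):
    raise TimeoutError("SIGALRM fired: call exceeded the allotted time")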
def RunSync(ts): """ NOTE: pdf pages taking a lot of time.Is it right to scrape them still? """ startTime = time.time() wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n" ) blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4'] q = "select * from " + wc_table + " where length(Content) = 0" rows_head = c.execute(q) rows = rows_head.fetchall() pc.printMsg( "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n" .format(len(rows))) conn.commit() for row in rows: t1 = time.time() if (len(row[13]) == 0): try: if row[6][-4:] not in blob_pages: response = web_requests.hitGetWithRetry( row[6], '', False, 2, 0.5, 30) if response != -1: gw.CS_SYNC_ITEM_SCRAPED += 1 res = response.text row_list = list(row) row_list[13] = res row = tuple(row_list) pc.printWarn( "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} " .format( row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?' d = (row[13], row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0],row[1])) else: gw.CS_SYNC_URL_UNREACHABLE += 1 pc.printErr( "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {} \t\t TimeTaken = {} \t NOW: {} " .format( row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) else: pc.printMsg( "\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page SYNC. Will use title. URL: {} \t\t TimeTaken = {} \t NOW: {} " .format(row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) except Exception as e: gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1 pc.printErr( "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}" .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime()), round((time.time() - t1), 5), e)) # logging.error(traceback.format_exc()) pass endTime = time.time() conn.close() pc.printMsg( "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable( ['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value']) table.add_row([ 'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES ]) table.add_row([ 'OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED ]) table.add_row([ 'TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 5) ]) pc.printSucc(table) pc.printErr( "------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n" ) table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value']) table.add_row( ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE]) table.add_row([ 'COUNT. TRY/CATCHED EXCEP. 
- SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR ]) pc.printErr(table) print("\n") pc.printWarn( '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n' .format(round((endTime - startTime), 5) / 60)) print("\n\n")
async def RunAsync(ts):
    """ Runs the async fetch ASYNC_SERIES_CONNECTION times in series; each run fetches its batch in parallel """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)

    """ get rows with content already present & put the count in gw.CS_ITEMS_WRITTEN_DIRECT. Will work just for the 1st iteration """
    c = conn.cursor()
    q = "select count(*) from " + wc_table + " where length(Content) != 0"
    no_scraping_needed_item_count = c.execute(q)
    no_scraping_needed_item_count = c.fetchone()[0]
    gw.CS_ITEMS_WRITTEN_DIRECT = no_scraping_needed_item_count
    conn.commit()

    for i in range(1, gw.ASYNC_SERIES_CONNECTION + 1):
        gw.CS_BOYS_STILL_PLAYING = 0
        pc.printMsg(
            "\n\n..........-------------\/\/\/------\/\/\/------\/\/\/---------------............ Running Async for {} -th time - \t Number of Async-runs remaining: {} \t\t NOW: {}\n\n"
            .format(i, (gw.ASYNC_SERIES_CONNECTION - i), time.strftime("%H:%M:%S", time.localtime())))
        await asyncFetchAll(conn, ts, i)
        pc.printMsg(
            "\t\t..........-------------\/\/\/------............ {} -th Async Running is done. Sleeping for 10 sec now......ZZZZZZZzzzzzzzzz\t\t NOW: {}\n\n"
            .format(i, time.strftime("%H:%M:%S", time.localtime())))
        time.sleep(10)
    conn.close()

    endTime = time.time()
    pc.printSucc(
        "\n\n***************************** All {} Async Content Scraping is Complete. TABLE: {} ******************"
        .format(gw.ASYNC_SERIES_CONNECTION, wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post ALL series Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[A] (A+B+C=X)', gw.CS_ITEMS_WRITTEN_DIRECT])
    table.add_row(['OUT : ITEMS SCRAPED WITH ASYNC', '[B] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    table.add_row(['TIME TAKEN - ASYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-ASYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Counted as-in last run of Async Content Scraping)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS in ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED SEMA EXCEP. in ASYNC ', gw.CS_ASYNC_SEMA_EXCEPTION_ERR])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for All {} Async Scraping (min) = {} ]\n'
        .format(gw.ASYNC_SERIES_CONNECTION, round((endTime - startTime) / 60, 5)))
    print("\n\n")
def run(ts): """ Scrapes Algolia's HN api for last 7 days & puts data in WC-DB. * max number of entries in algolia's single api call = 1000. So scrape for one day at a time * Link to documentation: https://hn.algolia.com/api Note: 1. For AskHN entries put `` tag & separate threshold 1. For ShowHN entries put `` tag & separate threshold 1. For Jobs@HN entries put `` tag => later as these entries dont have upvotes/comments Input: ts (format: 1598692058.887741) """ wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts),wc_table)) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n") startTime = time.time() """ here is how you add day to `ts`: from datetime import datetime, timedelta newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094 newts.timestamp() # 1598783633.284871 datetime.fromtimestamp(ts) #2020-08-29 17:15:32 """ """ ts_arr has last 7 days(including today's) (non-decimal stype)timestamps strings TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format """ ts_arr = [str(int(ts))] for i in range(6): new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1) new_ts = new_ts.timestamp() ts_arr.append(str(int(new_ts))) # for t in ts_arr: # print("timestamp: {} \t date: {}".format(t,datetime.fromtimestamp(int(t)))) index = gw.WC_TOTAL_URL_ENTRIES + 1 for i in range(len(ts_arr)-1): startepoch = ts_arr[i] endepoch = ts_arr[i+1] pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch,endepoch)) """ getting stories(articles) with upvotes_count > upvotes_threshold Also including: 1. TellHN (<tech_discuss>) 2. LaunchHN (<startup>) """ pc.printWarn(" \t............. scraping stories .............") try: url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_story) res_size = json.loads(data.content)["nbHits"] pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) sourceTag = '' content = '' sourceSite = 'HN' if(item["url"] is None): #as all ShowHNs may not have an url ...hihi... # print( '------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) if("Launch HN:" in item["title"]): # 1. LaunchHN sourceTag = 'startup' sourceSite += '/launch' if("Tell HN:" in item["title"]): # 2. 
TellHN sourceTag = 'tech_discuss' sourceSite += '/tell' else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, sourceTag, '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass """ getting ShowHNs """ pc.printWarn("\t............. scraping showHNs .............") try: url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_show) res_size = json.loads(data.content)["nbHits"] pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: content = '' sourceSite = 'HN/show' if(item["url"] is None): #as all ShowHNs may not have an url ...hihi... url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) # print( '-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'sideproj', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass """ getting AskHNs """ pc.printWarn("\t............. scraping askHNs .............") try: url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_ask) res_size = json.loads(data.content)["nbHits"] pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: content = '' sourceSite = 'HN/ask' if(item["url"] is None): #as AskHNs dont have any url ...hihi... 
url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) # print( '-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'prog_query', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass endTime = time.time() conn.commit() conn.close() gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n") pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table)) print("\n\n") table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES]) table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime),5)]) pc.printSucc(table) print("\n\n")
def run(ts): """ Get top 1000 submissions of the listed subreddits (max_limit is 1000; should be enough) Hence no use of `ts` here """ startTime = time.time() wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) pc.printSucc( '@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'. format(datetime.fromtimestamp(ts), wc_table)) pc.printMsg( "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n" ) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4'] # these give blob data; no point in scraping them index = gw.WC_TOTAL_URL_ENTRIES + 1 # Setup Client reddit = praw.Reddit( client_id=vault.R_CLIENT_ID, # PERSONAL_USE_SCRIPT_14_CHARS client_secret=vault.R_CLIENT_SECRET, # SECRET_KEY_27_CHARS user_agent=vault.R_USER_AGENT, # YOUR_APP_NAME username=vault.R_USERNAME, # YOUR_REDDIT_USER_NAME password=vault.R_PASSWORD) # YOUR_REDDIT_LOGIN_PASSWORD for subreddit, tag_arr in LIST.items(): try: pc.printWarn( "\t ............ Subreddit@R_UrlScraping : {} .............". format(subreddit)) sr = reddit.subreddit(subreddit) # for submission in sr.top('day',limit=10): # For testing.... # for submission in sr.top('year',limit=1000): #remove this & uncomemnt below line ENTRIES_IN_THIS_SUBRDDIT = 0 for submission in sr.top('week', limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT ): #NOTE: max limit is 1000 #Check1: if the post is unlocked by mods content = '' """ Fixing permalink type urls """ url = submission.url if (url[:2] == '/r'): url = "https://www.reddit.com" + url if (submission.locked == False): #Check2: if post is just an image, discard it if submission.url[ -4:] not in blob_pages: #as reddit currentluy hosts .png & .gif only # if permalink is a substring of url OR submission is a selfpost (text-only) => no need to scrape # NOTE: I know there might be links in post with some discription+link to other article he's reffering; but not worth wasting precious processing time if ((submission.permalink in submission.url) or (submission.is_self == True)): content = submission.selftext entry = [ index, "r/" + subreddit, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.RedditDate( str(datetime.fromtimestamp( submission.created))), submission.title, url, json.dumps(tag_arr), '', submission.score, submission.num_comments, '', '', text_actions.clean_text(content) ] # csv_functions.putToCsv(csv_file,entry) c.execute( 'INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index += 1 ENTRIES_IN_THIS_SUBRDDIT += 1 gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBRDDIT pc.printMsg( "\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBRDDIT = {} \t\t | \t gw.R_TOTAL_ITEMS_GOT_YET = {}" .format(ENTRIES_IN_THIS_SUBRDDIT, gw.R_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr( " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n" .format(index, e)) logging.error(traceback.format_exc()) pass endTime = time.time() gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET conn.commit() conn.close() pc.printMsg( "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Reddit Url Scraping is Complete. 
TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable(['Entity (Post r URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by HN', gw.R_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2) ]) pc.printSucc(table) print("\n\n")
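# NOTE: shape-only sketch (assumption): the `LIST` iterated in run() above is expected to map each
# subreddit name to the array of model tags that gets json.dumps()-ed into the SourceTags column.
# The real mapping is defined elsewhere in the repo and will differ; `_LIST_EXAMPLE` below is
# hypothetical and only documents the expected structure.
_LIST_EXAMPLE = {
    'programming': ['prog_discuss'],
    'startups': ['startup'],
}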
def RunSync(ts): """ Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url. * NOTE: * If conent is already present in the table, "clean" it too & append the newly scraped content to it. * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k Input: ts (format: 1598692058.887741) """ pc.printMsg( '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n' .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv')) csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '.csv' csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '_wc_sync.csv' index = 1 headers = [ 'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content' ] csv_functions.creteCsvFile(csv_dest_file, headers) f = csv.writer(open(csv_dest_file, "w")) # Flush the old file f.writerow([ 'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content' ]) with open(csv_src_file, mode='r') as csvfile: csv_reader = csv.DictReader(csvfile) line_count = 0 for row in csv_reader: if line_count == 0: print(f'Headers are {", ".join(row)}') line_count += 1 #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is if (len(row["Content"]) != 0): pc.printWarn( "\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()))) entry = [ row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"], row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"], text_actions.clean_text(row["Title"] + row["WeightedContent"]) + text_actions.getUrlString( row["Content"]), #add the url-words too text_actions.clean_text(row["Content"]) + text_actions.getUrlString(row["Content"]) ] global WRITTEN_ENTRIES_SYNC WRITTEN_ENTRIES_SYNC += 1 f = csv.writer(open(csv_dest_file, "a")) f.writerow(entry) #CHECK2(pre scraping): if(url == NULL)=>discard #CHECK3(pre scraping): if (row["title"]==NULL)=>discard elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)): pc.printWarn( "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. 
NOW: {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()))) time.sleep(0.0001) try: # response = web_requests.hitGetWithRetry(url,TIMEOUT=10) response = web_requests.hitGetWithRetry( row["Url"], '', False, 2, 0.5, 60) # if response.status_code == 200: if response != -1: # content = text_actions.contentfromhtml(response) #NOTE: for sync content = text_actions.contentfromhtml( response.text) #NOTE: for Async urlstrings = text_actions.getUrlString(content) content += urlstrings #add the url-words too # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings #add the url-words too #NOTE: for sync weightedcontent = text_actions.weightedcontentfromhtml( response.text ) + row[ "Title"] + urlstrings #add the url-words too #NOTE: for async line_count += 1 #CHECK1(post scraping): if (content == null)&&(row["Title"] != null)<already checked abouve>=> row["Content"] = clean_text(row["title"]) AND row["weightedContent"] = clean_text(row["title"]) if (len(content) == 0): content = row["Title"] weightedcontent = row["Title"] else: entry = [ row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"], row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"], text_actions.clean_text(weightedcontent), text_actions.clean_text(content) ] f = csv.writer(open(csv_dest_file, "a")) f.writerow(entry) pc.printMsg( "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}" .format( row["ID"], time.strftime("%H:%M:%S", time.localtime()))) else: global SKIPPED_SYNC SKIPPED_SYNC += 1 pc.printErr( "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , " .format(row["ID"], row["Url"])) except Exception as e: global FAILED_SYNC FAILED_SYNC += 1 pc.printErr( "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()), e)) pass pc.printMsg( "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n" .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv')) pc.printMsg( "\n----------------------------------------------------------------------------------\n" ) pc.printMsg( "|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC)) pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC)) pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC)) pc.printMsg( "\n----------------------------------------------------------------------------------\n" )
async def semaphoreSafeFetch(sem, row, session):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    global BOYS_RETURNED_HOME_ALIVE
    global BOYS_RETURNED_HOME_DEAD
    async with sem:
        try:
            row = await fetchWithRetry(row, session)
            BOYS_RETURNED_HOME_ALIVE += 1
            print(" \t\t\t\t\t\t\t\t\t\t\t\t BOYS_RETURNED_HOME_ALIVE = {}".format(BOYS_RETURNED_HOME_ALIVE))
            return row
        except Exception as e:
            BOYS_RETURNED_HOME_DEAD += 1
            print(" \t\t\t\t\t\t\t\t\t\t\t\t BOYS_RETURNED_HOME_DEAD = {}".format(BOYS_RETURNED_HOME_DEAD))
            # This error is mainly because of:
            ## 1. [nodename nor servname provided, or not known]
            ## 2. [Too many open files] => UPDATE: got fixed with using sqlite
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Scraping failed. Using Title for Content.... \n \t\t ERROR=> {}"
                .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime()), e))
            logging.error(traceback.format_exc())
            if len(row[13]) == 0:
                row_list = list(row)
                row_list[12] = row_list[5]
                row_list[13] = row_list[5]
                row = tuple(row_list)
            global ERR_ASYNC_TRIED_ERR
            ERR_ASYNC_TRIED_ERR += 1
            pc.printMsg(
                " \t\t\t============== [Tried Catch] Done Writing into csv for <ID = {}><src= {} > =============== "
                .format(row[0], row[1]))
            return row
async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_src_file & csv_dest_file(to be written)
        NOTE:
            * Semaphore limit is set below (currently 1000)
            * While writing the response to csv_dest_file, it is done in chunks of `N` entries at a time
    """
    tasks = []
    sem = asyncio.Semaphore(1000)

    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            line_count = 0
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(row["Content"])
                    entry = [
                        row["ID"], row["SourceSite"], row["ProcessingDate"],
                        row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                        row["Url"], row["SourceTags"], row["ModelTags"],
                        row["NumUpvotes"], row["NumComments"], row["PopI"],
                        row["WeightedContent"], row["Content"],
                    ]
                    csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

            responses = await asyncio.gather(*tasks)
            pc.printMsg(
                "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
                .format(len(responses)))

            for row in responses:
                if row["Content"] or row["Title"]:
                    if len(row["Content"]) == 0:  # scraping produced no content, so fall back to the title
                        row["Content"] = row["Title"]
                    entry = [
                        row["ID"], row["SourceSite"], row["ProcessingDate"],
                        row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                        row["Url"], row["SourceTags"], row["ModelTags"],
                        row["NumUpvotes"], row["NumComments"], row["PopI"],
                        row["WeightedContent"], row["Content"],
                    ]
                    await write_result(csv_out, entry)
                    # csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx Skipping for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
async def fetchWithRetry(row, session, csv_out):
    """
        Hits url (with retries):
            * if status == 200: puts content into csv
            * if still unable to hit after retries: Content = Title, WeightedContent = Title
    """
    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"], ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH), timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}".format(row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(" \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== ".format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n".format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(row["ID"], row["SourceSite"], status, row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== ".format(row["ID"], row["SourceSite"]))
    return row
async def fetchWithRetry(row, session):
    """
        Hits url (with retries):
            * if status == 200: returns response ((raw) Content & (raw) WeightedContent in row)
            * if still unable to hit after retries: Content = Title, WeightedContent = Title
        INPUT: `row` is a tuple with indices:
            ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4), Title(5), Url(6),
            SourceTags(7), ModelTags(8), NumUpvotes(9), NumComments(10), PopI(11), WeightedContent(12), Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6], ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH), timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')   #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc("\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}".format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")
                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n".format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== ".format(row[0], row[1]))
    return row
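# ----------------------------------------------------------------------------------
# Illustration only: a sketch of how the tuple-row contract documented above could be
# exercised in isolation. The field values are made up; demo_single_fetch is not part
# of the pipeline and is never called here.
async def demo_single_fetch():
    row = (
        1, 'HN', '2020-08-29', '1598692058', '2020-08-28',   # ID(0)..CreationDate(4)
        'Example title', 'https://example.com',              # Title(5), Url(6)
        '', '', '10', '2', '0.5',                            # SourceTags(7)..PopI(11)
        '', '',                                               # WeightedContent(12), Content(13)
    )
    async with ClientSession() as session:
        return await fetchWithRetry(row, session)

# asyncio.get_event_loop().run_until_complete(demo_single_fetch())
# ----------------------------------------------------------------------------------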
def RunAsync(ts):
    """
        Picks the wc-db table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each Url - ASYNCHRONOUSLY
            * NOTE:
                * If content is already present in the table, "clean" it too & append the newly scraped content to it.
                * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'.format(datetime.fromtimestamp(ts), SEMAPHORE_COUNT, CONNTECTION_COUNT, wc_table))
    startTime = time.time()

    # csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'.csv'
    # csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'

    # Run the async job
    asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(asyncFetchAll(ts)))

    endTime = time.time()

    pc.printSucc("\n****************** (Async)Content Scraping is Complete , TABLE: {} ********************".format(wc_table))
    pc.printMsg("\n--------------------------------------------------------------------------------------------------------------------------------")
    pc.printMsg("|\t\t IN : Total Entries in Url-Scraped Output Table \t\t | \t\t {} \t\t|".format(ENTRIES_TO_BE_WRITTEN))
    pc.printMsg("|\t\t OUT: WRITTEN_ENTRIES_ASYNC_DIRECT(content exists) \t\t | \t\t {} \t\t|".format(WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg("|\t\t OUT: WRITTEN_ENTRIES_ASYNC_SCRAPED(scraped entries) \t\t | \t\t {} \t\t|".format(WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printErr("\n\n------------------ ERRORS In Scraping (Written nonetheless; counted in WRITTEN_ENTRIES_ASYNC_SCRAPED) --------------------------\n")
    pc.printMsg("=================================================================================================================================")
    pc.printErr("|\t\t ERR_ASYNC_NO_CONTENT_IN_SCRAPING(url hit; no content extracted) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_NO_CONTENT_IN_SCRAPING))
    pc.printErr("|\t\t ERR_ASYNC_ON_URL_ERROR(url not hit) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_ON_URL_ERROR))
    pc.printErr("|\t\t ERR_ASYNC_TRIED_ERR(other try/catch errs) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_TRIED_ERR))
    pc.printMsg("---------------------------------------------------------------------------------------------------------------------------------\n")
    pc.printWarn('\t\t\t\t------------------------->>>>>> [ Semaphore Count = {}, Tcp connector limit ={} ]\n'.format(SEMAPHORE_COUNT, CONNTECTION_COUNT))
    pc.printWarn('\t\t\t\t------------------------->>>>>> [ Time Taken(sec) = {} ]\n'.format(int(endTime - startTime)))
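# ----------------------------------------------------------------------------------
# Illustration only: one way this entry point might be driven. The timestamp format
# matches the docstring above (e.g. 1598692058.887741); the wc_<ts> table for that
# timestamp must already exist, so in practice `ts` would be the run timestamp shared
# by the rest of the pipeline rather than a fresh time.time(). Kept commented so the
# module's import behaviour is unchanged.
#
# if __name__ == '__main__':
#     ts = 1598692058.887741   # hypothetical run timestamp
#     RunAsync(ts)
# ----------------------------------------------------------------------------------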