Example #1
0
def make_helper_tables():
    """Create the Regex table and the Match_Regex_IdMap table.
    These helper tables are used later to reconstruct the matches;
    they do not contain the scientific names."""
    for row in session.query(Mapping):  #for each entry in the table mapping_cites
        #process the common names
        entree = row.common_name.split('; ')  #create a list with the common names as entries
        for name in entree:  #for each common name
            name = name.lower()  #lowercase the letters (str.lower returns a new string)
            list_of_words = name.split(' ')  #create a list of the words composing the common name
            for i in list_of_words:
                if len(i) <= 2:  #leave small words alone
                    continue
                #big words - we do something with them, namely:
                if type(i) == str:  #check it is a string; if not, just sit idle
                    res = word_to_regex(i)
                else:
                    print('problem with input')
                    res = None
                if len(i) < 5:  #short words must meet additional requirements before being added
                    if res is None or (r'\s' in res) or (r'\w' in res) or res == '':
                        print('exception, pass short error', res)
                        res = None
                    elif session.query(Regex).filter(
                            Regex.reg == res).scalar() is None:
                        #there isn't already an entry and the regex isn't problematic
                        entry = Regex(reg=res, word=i.lower().strip(';'))  #create entry
                        entry.insertregex(session)  #insert entry
                        session.commit()  #commit entry
                #for long words (len >= 5)
                elif (session.query(Regex).filter(Regex.reg == res).scalar()
                      is None and res is not None):
                    #the entry doesn't exist: create it in the Regex table
                    entry = Regex(reg=res, word=i.lower().strip(';'))
                    entry.insertregex(session)
                    session.commit()
                #next step: fill up Match_Regex_IdMap
                requested_re = session.query(
                    Regex.id).filter(Regex.reg == res).scalar()
                request = session.query(Match_Regex_IdMap.id).filter_by(
                    id_re=requested_re, id_map=row.id).scalar()
                if request is None and requested_re is not None:
                    #there is no entry yet (request) and the regex exists in Regex (requested_re)
                    entry = Match_Regex_IdMap(id_re=requested_re,
                                              id_map=row.id)  #create entry
                    entry.insertMatch(session)  #insert
                    session.commit()  #commit
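
#Note: word_to_regex is defined elsewhere in the project and its implementation is
#not shown here. Purely as a hypothetical sketch (an assumption, not the author's
#actual code), it appears to build a typo-tolerant regular expression for one word,
#along these lines:
import re


def word_to_regex_sketch(word):
    """Illustrative only: build a misspelling-tolerant regex for one word."""
    substitutions = {'a': '[a@4]', 'o': '[o0]', 'i': '[i1l]'}  #assumed substitution map
    parts = []
    for char in word.lower():
        cls = substitutions.get(char, re.escape(char))
        if char in 'aeiou':
            cls += '?'  #vowels are often dropped in ads ("brd" for "bird")
        parts.append(cls)
    return ''.join(parts)


#word_to_regex_sketch('macaw') -> 'm[a@4]?c[a@4]?w', which also matches "mcw",
#"macw" and "mcaw" from the misspelling lists used in the later examples.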
Example #2
0
def re_generator_species():
    """Create a dictionary {id_species : {common name 1 : regex1, ....}}. These regexes
    will be used to multiples times, so we create them for once for each parsing. We can change
    the code here to adapt the way of create these regexes. The adaption used can be used to create
    different classification. The classification 2 checks the presence of the words of one common name in
    the text. The classification 3 considers the order of the word and the proximity."""
    regex_classification_2 = {}
    regex_classification_3 = {}
    for row in session.query(Mapping):
        #List of common names (stop_names are common names that are too generic and might match everything [e.g. Little blue macaw])
        cns = [
            _.strip(" ") for _ in row.common_name.split(";")
            if (len(_.strip(" ")) > 0 and _.strip(" ") not in stop_names)
        ]
        #Add the scientific name
        cns.append(row.scientific_name_cites)
        #List of lists of the terms composing each common name, excluding short words
        cns_decomposed = [[
            str.lower(_) for _ in re.split(" |-|'", first) if (len(_) > 2)
        ] for first in cns if (len(first) > 0)]
        #Drop words that are useless for characterizing a species (e.g. "and")
        cns_decomposed = [[word for word in l if (word not in useless_words)]
                          for l in cns_decomposed]
        #Drop common bird denominations (e.g. "parrot") since several ads omit them as trivial,
        #but keep them if only one word would remain
        cns_decomposed = [
            list(filter(lambda x: x not in too_common_words, l))
            if len(list(filter(lambda x: x not in too_common_words, l))) >= 2
            else l for l in cns_decomposed
        ]
        print(cns_decomposed)
        #Replace each letter with its substitute from the mitigation dict (mp_mit_2)
        miss_cns = map(
            lambda list_words: ("".join([
                mp_mit_2[char] if (char in mp_mit_2.keys()) else char
                for char in list(word)
            ]) for word in list_words), cns_decomposed)
        #Materialize the map object into lists
        miss_cns = [list(_) for _ in list(miss_cns)]
        dict_regex_2 = {}
        dict_regex_3 = {}
        #Populate the dict with regex according to each name
        for name_decomposed, name in zip(miss_cns, cns):
            #Regex that only requires the words to appear in the text. We forbid a letter right before the word to avoid matching "reared" when searching for "red"
            reg_2 = "".join(
                [f"(?=.*[^0-9a-zÀ-ÿ]{word})" for word in name_decomposed])
            reg_2 = f"^{reg_2}.*"
            dict_regex_2[name] = reg_2
            #Regex that requires the words in a specific order and within a specific proximity
            reg_3 = ".{0,10}".join([f"{word}" for word in name_decomposed])
            reg_3 = f"^(?=.*({reg_3})).*"
            dict_regex_3[name] = reg_3

        regex_classification_2[row.id] = (dict_regex_2, row.annex_number_CITES)
        regex_classification_3[row.id] = (dict_regex_3, row.annex_number_CITES)
    return (regex_classification_2, regex_classification_3)
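
#To make the difference between the two classifications concrete, here is a small
#self-contained demo (with invented inputs) of the two regex shapes built above:
import re

words = ['blue', 'macaw']
reg_2 = '^' + ''.join(f'(?=.*[^0-9a-zÀ-ÿ]{w})' for w in words) + '.*'  #all words, any order
reg_3 = '^(?=.*(' + '.{0,10}'.join(words) + ')).*'  #same order, within 10 characters

print(bool(re.match(reg_2, 'selling my macaw, color blue')))  #True: order is ignored
print(bool(re.match(reg_3, 'selling my macaw, color blue')))  #False: wrong order
print(bool(re.match(reg_3, 'young blue macaw for sale')))  #True: ordered and close together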
Example #3
0
def check_update(browser, session):
    """Give TRUE whether the database is up to date"""
    firstad = browser.driver.find_elements_by_xpath(
        '//div[@class="row clearfix"][@style]')[0]
    ad_number = firstad.find_element_by_xpath(
        ".//input[@type=\"checkbox\"]").get_attribute("name")
    country = map_country(browser.driver.current_url)
    resp = session.query(exists().where(
        and_(Urls_ads.ad_number == ad_number,
             Urls_ads.country_id == country))).scalar()
    return resp
Example #4
0
def make_dictionnary():
    """Creates dictionnary for small multiple (Script classification_1_visualisation), this dictonary contains the words corresponding to the regex used for classification 1"""
    result_dict = {}
    for row in session.query(Mapping):  #for each row in the mapping table
        result = []  #temporary list
        #add the scientific name
        result.append(row.scientific_name_cites.lower())  #append the scientific name to the list
        #add the common names
        entry = row.common_name.split('; ')  #create a list with each common name as an entry
        for name in entry:  #for each name in the list created on the previous line
            name = name.lower()  #lowercase the letters (str.lower returns a new string)
            list_of_words = name.split(' ')  #split the common name into its components (again a list)
            for i in list_of_words:  #for each word
                try:
                    res = word_to_regex(i)  #try to make a regular expression from the word
                except Exception:
                    res = None
                if len(i) <= 2:  #leave small words alone
                    pass
                elif res is None or (len(i) < 5 and
                                     ((r'\s' in res) or (r'\w' in res) or res == '')):
                    #leave alone problem words: words shorter than 5 letters whose regex
                    #contains whitespace classes, generic character classes, or is empty
                    pass
                else:  #big words - we do something with them, namely:
                    if type(i) == str:  #check it is a string; if not, just sit idle
                        result.append(i.lower().strip(';'))  #append the lowercase result without ';'
        result_dict[row.id] = result  #for each Mapping entry, store the list of used words
    return result_dict  #return the dictionary
Example #5
0

if __name__ == '__main__':

    #Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
   
    path_result = './results/getCodes/codes/'
    
    #Create the directory
    os.makedirs(os.path.dirname("./results/parseCodes/documentation/"), exist_ok=True)

    #Iterate through client codes
    for row in session.query(Ads_Codes).filter_by(status=0): #status=0
        #Skip if already exists
        if session.query(exists().where(Parse_ads.ad_id == row.ad_id)).scalar():
            pass
        else:
            #Copy the global variable containing the fields in the ad
            dic_champs = dict_champ.copy()
            filename = row.client_code
            #Set it up for later use
            dic_champs["Ad Number"] = row.ad_number
            #Obtain the HTML object
            objet = lxml.html.parse(f"{path_result}{filename}").getroot()
            #The main function that parses the HTML object
            dic_champs = get_champs(dic_champs, objet, doc)
            entry = create_entry(dic_champs, row)
Example #6
0
                        Regex.id).filter(Regex.reg == res).scalar()
                    request = session.query(Match_Regex_IdMap.id).filter_by(
                        id_re=requested_re, id_map=row.id).scalar()
                    if request == None and requested_re != None:  #if there isn't an entry (request) and the regular_expression exists in Regex (requested_re)
                        entry = Match_Regex_IdMap(id_re=requested_re,
                                                  id_map=row.id)  #create entry
                        entry.insertMatch(session)  #insert
                        session.commit()  #commit


if __name__ == '__main__':
    if status_modified:  #if we changed something with the generation of regexes or the mapping table
        make_helper_tables()  #regenerate helper tables
    res = make_dictionnary()
    #generate list_scientific
    for row in session.query(Mapping):
        a = word_to_regex(row.scientific_name_cites)
        list_scientific.append(a)

    path_result = './results/parse/'
    #Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()

    #create entries for Parsing_Psittaciformes (one per ad)
    #match_scientific and match_common only take the values 1 (match found) or 0 (no match)
    #list_common lists all the regular expressions found
    for row in session.query(Parse_ads):  #for each parsed ad
        if session.query(
                Parsing_Psittaciformes_or_no.ad_id
Example #7
0
def resume_extraction(browser, session, pages):
    """Check if the first ad of the page is in the database. Otherwise pass n pages per n pages until the first new ad
    and then go back n pages further. The purpose is to locate the interval of n pages where the script has stopped.    
    This function has been made because there is no way to select a specific page on the website or go n pages further (See the UPDATE WARNING in the main)"""

    #Obtain the current total number of pages for this country; the purpose is to avoid reaching the last page
    wait = WebDriverWait(browser.driver, 90)
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@style][contains(text(),\"Number of ads: \")]")))
    raw_string = browser.driver.find_element_by_xpath(
        "//div[@style][contains(text(),\"Number of ads: \")]").text
    total_pages = ceil(
        int(re.findall(r"Number of ads: (\d*)\. .*", raw_string)[0]) /
        ADS_PER_PAGE)
    #Extract the first ad before the loop
    wait = WebDriverWait(browser.driver, 90)
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '//div[@class="row clearfix"][@style]')))
    firstad = browser.driver.find_elements_by_xpath(
        '//div[@class="row clearfix"][@style]')[0]
    ad_number = firstad.find_element_by_xpath(
        ".//input[@type=\"checkbox\"]").get_attribute("name")
    country = map_country(browser.driver.current_url)

    #Counter for the number of pages that have been skipped
    counter = 0
    while session.query(exists().where(
            and_(Urls_ads.ad_number == ad_number, Urls_ads.country_id
                 == country))).scalar() and not counter == total_pages - 1:
        for n in range(pages):
            time.sleep(random.uniform(3, 3.2))
            test = 0
            while not test:
                try:
                    wait = WebDriverWait(browser.driver, 90)
                    wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH, "//input[@name=\"button_hits_seen\"]")))
                    browser.driver.find_element_by_xpath(
                        "//input[@name=\"button_hits_seen\"]").click()
                    test = 1
                except WebDriverException as e:
                    print(f"{e}\n")
                    doc.adderrorlog(f"{e}\n")
                    # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                    with open(
                            f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                            'wb') as f:
                        f.write(str(doc).encode('utf-8'))
                    #The webdriver is on an error page, go back
                    browser.driver.back()
            counter += 1
            #Need to break here, otherwise an error pops up at the end since we would be clicking a next button that no longer exists
            print(
                f"{country} - Skipped pages : {counter} / Total pages : {total_pages}\n"
            )
            doc.addlog(
                f"{country} - Skipped pages : {counter} / Total pages : {total_pages}\n"
            )
            if counter == total_pages - 1:
                break
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//div[@class="row clearfix"][@style]')))
                firstad = browser.driver.find_elements_by_xpath(
                    '//div[@class="row clearfix"][@style]')[0]
                ad_number = firstad.find_element_by_xpath(
                    ".//input[@type=\"checkbox\"]").get_attribute("name")
                test = 1
            except NoSuchWindowException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                #The webdriver is on an error page, go back
                browser.driver.back()
            except WebDriverException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                #The webdriver is on an error page, go back
                browser.driver.back()
    #Go back n pages if we are not on the first page; we check via the presence of the previous button
    if check_exists_by_xpath(browser.driver,
                             "//input[@name=\"previous_hits_button\"]"):
        #Number of pages to go back, according to the number of pages actually skipped
        #e.g. with pages=20 and counter=47, go back 47 % 20 = 7 pages (plus one extra click below)
        back_pages = counter % pages if pages != counter else pages
        for n in range(back_pages + 1):
            time.sleep(random.uniform(0.2, 1))
            test = 0
            while not test:
                try:
                    wait = WebDriverWait(browser.driver, 90)
                    wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//input[@name=\"previous_hits_button\"]")))
                    browser.driver.find_element_by_xpath(
                        "//input[@name=\"previous_hits_button\"]").click()
                    test = 1
                except WebDriverException as e:
                    print(f"{e}\n")
                    doc.adderrorlog(f"{e}\n")
                    # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                    with open(
                            f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                            'wb') as f:
                        f.write(str(doc).encode('utf-8'))
                    #The webdriver is on an error page, go back
                    browser.driver.back()
            counter -= 1
        print(f"{country} : Go back {back_pages} pages\n")
        doc.addlog(f"{country} : Go back {back_pages} pages\n")
    doc.addlog(
        f"{country} : To resume the extraction : {counter} have been passed per {pages} pages interval"
    )
    print(f"{country} : SUCCESS Resume")
Example #8
0
def getads(browser, session, pages=20, update=True):
    """Go through all pages to collect articles' urls, number of pages to search the last stop. Whether there are
    new recent articles, the function updates the database rather than resume the extraction"""
    added_ad = 0
    #Get the current country
    country = map_country(browser.driver.current_url)
    #Check whether we are updating
    if not check_update(browser, session):
        print(f"{country} : Updating the ads")
    doc.addlog(f"{country} : Updating the ads")
    #If the country has not yet been extracted, or has fewer entries than the pages interval, no resume.
    nbr_entries_country = session.query(Urls_ads).filter(
        Urls_ads.country_id == country).count()
    if nbr_entries_country < pages:
        pass
    else:
        #Resume the extraction
        print(f"{country} : Resuming extraction...")
        doc.addlog(f"{country} : Resuming extraction...")
        resume_extraction(browser, session, pages)
    #Just to avoid skipping the first page
    counter = 0
    #Allows breaking once we have updated the start of the ads and there are no more new ads
    counter_not_new = 0
    #No need to wait long between requests: it is the same page, just JavaScript
    #Loop while the next button exists; it disappears on the last page
    while check_exists_by_xpath(browser.driver,
                                "//input[@name=\"button_hits_seen\"]"):
        #Click on the "next button" / Except the first page
        time.sleep(random.uniform(2, 2.5))
        #Try until it works or CTRL-C
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//input[@name=\"button_hits_seen\"]")))
                if counter == 1:
                    browser.driver.find_element_by_xpath(
                        "//input[@name=\"button_hits_seen\"]").click()
                counter = 1
                test = 1
            except TimeoutException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                #Timeout because there is no more next button; break the loop
                break
            except WebDriverException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                #The webdriver is on an error page, go back
                browser.driver.back()
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//div[@class="row clearfix"][@style]')))
                for ad in browser.driver.find_elements_by_xpath(
                        '//div[@class="row clearfix"][@style]'):
                    #The website is inconsistent; there are tags without an ad
                    ad_number = ad.find_element_by_xpath(
                        ".//input[@type=\"checkbox\"]").get_attribute("name")
                    url = ad.find_element_by_xpath(".//a").get_attribute(
                        "href")
                    #Avoid the stale element error
                    test = 1
                    #Check whether the entry already exists (by ad_number and country) and do nothing in that case
                    if session.query(exists().where(
                            and_(Urls_ads.ad_number == ad_number,
                                 Urls_ads.country_id == country))).scalar():
                        counter_not_new += 1
                    else:
                        #Reset the counter since a new entry has been added
                        counter_not_new = 0
                        entry = Urls_ads(
                            url=url,
                            ad_id=f"{ad_number}_{get_abr_country(url)}",
                            ad_number=int(ad_number),
                            country_id=country)
                        #Add the entry in the database
                        entry.insertURL(session)
                        added_ad += 1
                        print(f"{country} : Ad added (Tot : {added_ad})\n")
                        doc.addlog(
                            f"{country} : Ad added (Tot : {added_ad})\n")
            #Since the code runs for a long time, we sometimes get a "stale element" error
            except StaleElementReferenceException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                #The DOM reference is stale: refresh the page and retry
                print(f"{country} : Refreshing...\n")
                doc.addlog(f"{country} : Refreshing...")
                browser.driver.refresh()
        print(
            f"{country} :\n\tNext page\n\tNo new entry since {counter_not_new} entries\n"
        )
        doc.addlog(
            f"{country} :\n\tNext page\n\tNo new entry since {counter_not_new} entries\n"
        )
        if counter_not_new > (pages * ADS_PER_PAGE):
            #We have updated the country
            print(f"{country} : Ads have been updated")
            break

    print(f"{country} : No more ads\n")
    doc.addlog(f"{country} : No more ads\n")
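
#The "test = 0 / while not test: try/except" retry idiom recurs throughout the two
#functions above. Purely as an illustrative refactoring sketch (not part of the
#original code), it could be captured by a small helper:
def retry_until_success(action, exceptions, on_error):
    """Run action() until it stops raising one of the given exceptions."""
    while True:
        try:
            return action()
        except exceptions as e:
            on_error(e)  #e.g. log the error, save the documentation, go back


#e.g. retry_until_success(lambda: element.click(), (WebDriverException,), handle_error)
#where handle_error is a hypothetical callback doing the logging shown above.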
Example #9
0
    #~~~~~~~~~~~~~~~ Catch'em all ~~~~~~~~~~~~~~~#
    """REMOVE STRING TO UPDATE THE COUNTRY => That's the only solution you can't go to the end of the ads' list
    and you can't select a specific page AND you can't sort by age (Advanced search doesn't work at the moment)
    For this reason, the code contains a great amount of try/except since we need to go across all pages manually, 
    this increases the chance of a bug and we need to handle them
    
    
    UPDATE WARNING : Le code peut être grandement amélioré car le numéro de page dans l'URL est visible si l'on clique
    sur le bouton pour développer plus d'annonce. Le code ici ne prenait pas cela en compte. Il n'a pas été amélioré car 
    le crawling avait déjà été fait et le présent code peut extraire les données à mettre à jour.
    Ici malheureusement, il faut être sûr d'avoir toutes les annonces jusqu'à la dernière page avant de pouvoir faire de la veille."""

    completed_countries = []  #["UNITED STATES", "CANADA", "UNITED KINGDOM", "IRELAND", "AUSTRALIA", "NEW ZEALAND", "MALAYSIA", "INDONESIA", "HONG KONG", "INDIA", "SINGAPORE", "PHILIPPINES"] #REMOVE TO UPDATE
    for row in session.query(Country).all():
        url = row.url

        info = getbirds(browser, url)
        doc.info['selenium'] = []
        doc.info['selenium'].append(info)

        #Skip the country if completed
        country = map_country(browser.driver.current_url)
        doc.addlog(f"{country} : info = getbirds(browser, url)")
        if country in completed_countries:
            print(f"{country} : Passed")
            doc.addlog(f"{country} : Passed")
            pass
        else:
Example #10
0
if __name__ == '__main__':
    #Global variable containing the words to match (including common misspellings)
    list_of_birds_test = ["bird", "brd", "amazon", "amazona", "parot", "prot", "african grey", "macaw", "mcw",
                          "macw", "mcaw", "macow", "cockato", "winged", "paraket", "lovebird", "canary", "cnry"]
    #list_of_birds is the list of regular expressions created from the words in list_of_birds_test
    list_of_birds = create_regex_for_birds(list_of_birds_test)
    print(list_of_birds)
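
    #create_regex_for_birds is defined elsewhere in the project. Judging from
    #Example #11 below, which builds the same list inline, it plausibly just maps
    #word_to_regex over the words (an assumption, not the author's actual code):
    #    list_of_birds = [word_to_regex(w) for w in list_of_birds_test]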

    #Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    path_result = './results/classification/'

    #Parse database
    for row in session.query(Parse_ads):

        #If ad (ad_id) not yet classified (0 or 1)
        if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() == None:

            #Step 1 : search in the title for each regular expression of list_of_birds
            for expression in list_of_birds:
                #res is the match object from searching the title (None if no match)
                res = re.search(str(expression), row.title)
                #If there is a match
                if res != None:
                    #And if there isn't already an entry
                    if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() == None:
                        #The entry is the ad_id and the status is 1
                        entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                        entry.insertParse_bird(session)
Example #11
0
#Strategy: look in the title for words describing birds, using regular expressions
list_of_birds_test = ["bird","brd","amazon","amazona","parot", "prot", "african grey","macaw","mcw","macw","mcaw","macow","cockato","winged","paraket"] #Global variable with the words to match (including common misspellings)
list_of_birds = []
for i in list_of_birds_test:
    a = word_to_regex(i)
    list_of_birds.append(a)

if __name__ == '__main__':
    path_result = './results/parse/'
    #Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    #parse the database
    c = 0  #counter tracing how many ads have status 1 = classified as bird
    for row in session.query(Parse_ads):
        if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() == None:
            #step 1: search in the title
            for expression in list_of_birds:
                #For each defined regular expression
                res = re.search(expression, row.title) #search in title
                if res != None: #if there is a match, go on
                    if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() == None: #if there isn't already an entry
                        entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                        entry.insertParse_bird(session)
                        session.commit()
                        c += 1
            #step 2: search in the description
            for expression in list_of_birds:
                if row.description != None:
Example #12
0
                except Exception:
                    price_final_f = montant_f  #if an error occurs, keep montant_f as the price
    return price_final_f, res_currency


def entry_ad_clean(row, id_vendor, price, currency, price_in_dollar):
    """function to make an entry into ads_clean table"""
    entry = Ads_clean(ad_id = row.ad_id, ad_number = row.ad_number, id_vendor=id_vendor, title = row.title,\
    description = row.description, breed = row.breed, age = row.age, sex = row.sex, primary_color = row.primary_color,\
    secondary_color = row.secondary_color, price = price, currency = currency, price_in_dollar= price_in_dollar,payment_forms = row.payment_forms)
    entry.insertAds_clean(session)
    session.commit()


if __name__ == '__main__':
    for row in session.query(Parse_ads):
        #get status_parrot of the ad
        status_parrot = session.query(
            Classification_3_Ads.parrot).filter_by(ad_id=row.ad_id).scalar(
            )  #checks if the ad is classified as parrot
        # get email, website, phone of the ad
        if row.description != None:  #if there is a description
            email = get_email(row)
            website = get_website(row)
            phone = get_phone(row)
        else:  #otherwise set them to None
            email = None
            website = None
            phone = None
        #create vendor entry if there isn't one
        if session.query(Vendor_analyse).filter_by(pseudo=row.pseudo).scalar(
Example #13
0
    #Create the directory
    os.makedirs(os.path.dirname("./results/classification_2_3/documentation/"),
                exist_ok=True)

    #~~~~~~~~~~~~~~ Create Regexes ~~~~~~~~~~~~~~
    dic_regexes = re_generator_species()
    doc.info["regexes"] = dic_regexes
    doc.addlog("Create regexes")
    doc.info["cage_regex"] = re_hasCage
    doc.info["isbird_regex"] = re_isBird

    #Two dicts with regexes, one per classification.
    for dr, classification in zip(
            dic_regexes, (Classification_2_Ads, Classification_3_Ads)):

        for row in session.query(Parse_ads):
            #Skip if already exists
            if session.query(exists().where(
                    classification.ad_id == row.ad_id)).scalar():
                pass
            else:
                entry = search_re(ad=row,
                                  regexes=dr,
                                  classification=classification)
                print(f"{row.ad_id}...\n")
                doc.addlog(f"Search in ad {row.ad_id}")
                entry.insert(session)

        #Write the doc several times so the documentation is not lost if the script fails.
        with open(
                f'./results/classification_2_3/documentation/{date_parsing}_documentation.json',
Example #14
0
from ressources.documentation import Documentation  #documentation.py file located in the ressources folder
from ressources.db import session, updateURL, Url  #db.py file located in the ressources folder


def saveData(browser, filename_prefix='selenium'):
    '''Example function that saves the client-side code, a screenshot and the server-side code'''
    browser.clientCode('./results/html/' + filename_prefix +
                       '_clientCode.html')
    browser.screenshot(
        './results/screenshots/' + filename_prefix + '_screenshot.png',
        width=1080)  #fix the window width with width


if __name__ == '__main__':
    doc = Documentation()
    # ~~~~~~~~~~~~~~~ Selenium start ~~~~~~~~~~~~~~~ #
    browser = Chrome(headless=True)  #or Chrome(...)

    # ~~~~~~~~~~~~~~~ Fetch the URLs to visit ~~~~~~~~~~~~~ #
    for i in session.query(Url).filter_by(status=0):
        doc.info['selenium'] = []
        info = browser.get(i.url)
        doc.info['selenium'].append(info)
        saveData(browser, filename_prefix=str(i.id))
        with open(
                './results/documentation/' + str(i.id) + '_documentation.json',
                'wb') as f:
            f.write(str(doc).encode('utf-8'))
        updateURL(session, i)  #update the URL's status
        time.sleep(random.uniform(0.1, 0.2))  #wait between 0.1 and 0.2 seconds