Example #1
0
def entry_ad_clean(row, id_vendor, price, currency, price_in_dollar):
    """Insert one record into the ads_clean table and commit it.

    The descriptive columns are copied from ``row``; the vendor id and the
    price information are the values passed in by the caller.
    """
    # Columns whose value is taken unchanged from the source row.
    copied_columns = (
        'ad_id',
        'ad_number',
        'title',
        'description',
        'breed',
        'age',
        'sex',
        'primary_color',
        'secondary_color',
        'payment_forms',
    )
    row_values = {column: getattr(row, column) for column in copied_columns}

    record = Ads_clean(
        id_vendor=id_vendor,
        price=price,
        currency=currency,
        price_in_dollar=price_in_dollar,
        **row_values,
    )
    record.insertAds_clean(session)
    session.commit()
Example #2
0
def make_helper_tables():
    r"""Fill the Regex table and the Match_Regex_IdMap link table.

    For every row of ``Mapping``, the ``common_name`` field is split on
    ``'; '`` into individual common names. Each name is lower-cased and split
    on spaces into words. Every word longer than two characters is converted
    with ``word_to_regex``; the resulting expression is stored once in
    ``Regex`` (together with the lower-cased word, stripped of ``';'``) and
    linked to the mapping row through ``Match_Regex_IdMap``.

    Words of three or four characters are kept only when their expression is
    non-empty and contains neither the text ``\s`` nor the text ``\w``.

    These tables are used later to reconstruct matches; scientific names are
    not handled here. Each insert is committed immediately.
    """
    for row in session.query(Mapping):
        if not row.common_name:
            # No common name on this mapping row: nothing to derive.
            continue

        for name in row.common_name.split('; '):
            # str.lower() returns a new string, so the result must be rebound;
            # the original call discarded it and the names kept their case.
            name = name.lower()

            for token in name.split(' '):
                if len(token) <= 2:
                    # Very short words are ignored.
                    continue

                res = word_to_regex(token)
                if res is None:
                    print('problem with input')
                    continue

                # Short words get additional requirements before being added.
                # Raw strings keep the backslash literal without relying on
                # Python passing an invalid escape sequence through.
                if len(token) < 5 and (r'\s' in res or r'\w' in res
                                       or res == ''):
                    print('exception, pass short error', res)
                    continue

                # Create the Regex entry if this expression is not stored yet.
                regex_id = session.query(
                    Regex.id).filter(Regex.reg == res).scalar()
                if regex_id is None:
                    # The word is derived from the current token in both the
                    # short- and long-word case (previously the short-word
                    # path read a variable assigned only on the other path).
                    entry = Regex(reg=res, word=token.strip(';'))
                    entry.insertregex(session)
                    session.commit()
                    regex_id = session.query(
                        Regex.id).filter(Regex.reg == res).scalar()
                if regex_id is None:
                    continue

                # Link the expression to this mapping row, once.
                link_id = session.query(Match_Regex_IdMap.id).filter_by(
                    id_re=regex_id, id_map=row.id).scalar()
                if link_id is None:
                    entry = Match_Regex_IdMap(id_re=regex_id, id_map=row.id)
                    entry.insertMatch(session)
                    session.commit()
Example #3
0
def entry_vendor(row, email, phone, website, status_parrot):
    """Insert one record into the vendor_analyse table and commit it.

    Every column except the id is filled: the vendor fields are copied from
    ``row`` and the remaining columns take the values passed by the caller.
    Returns nothing.
    """
    print('make vendor entry')  # progress message on the command line

    # Columns whose value is taken unchanged from the source row.
    copied_columns = (
        'pseudo',
        'contact_information',
        'name',
        'company',
        'zip',
        'city',
        'state',
        'country',
        'county',
        'region',
        'province',
        'email',
        'phone',
        'redirect_website',
    )
    row_values = {column: getattr(row, column) for column in copied_columns}

    record = Vendor_analyse(
        email_description=email,
        phone_description=phone,
        website_deviate=website,
        status_parrot=status_parrot,
        **row_values,
    )
    record.insertVendor_analyse(session)
    session.commit()
Example #4
0
   
    path_result='./results/getCodes/codes/'
    
    #Create the directory
    os.makedirs(os.path.dirname("./results/parseCodes/documentation/"), exist_ok=True)

    #Iterate through client codes
    for row in session.query(Ads_Codes).filter_by(status=0): #status=0
        #Skip if already exists
        if session.query(exists().where(Parse_ads.ad_id == row.ad_id)).scalar():
            pass
        else:
            #Copy the global variable containing the fields in the ad
            dic_champs=dict_champ.copy()
            filename=row.client_code
            #Set it up to use later
            dic_champs["Ad Number"]=row.ad_number
            #Obtain the HTML object
            objet = lxml.html.parse(f"{path_result}{filename}").getroot()
            #The main function that parses the HTML object
            dic_champs = get_champs(dic_champs, objet, doc)
            entry = create_entry(dic_champs, row)

            session.commit()
            entry.insertParse_ads(session)
            row.update(session)

        #Write the documentation on every iteration so that it is not lost if the script fails.
        with open(f'./results/parseCodes/documentation/{date_parsing}_documentation.json', 'wb') as f:
                    f.write(str(doc).encode('utf-8'))
        #If ad (ad_id) not yet classified (0 or 1)
        if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() == None:

            #Step 1 : search in the title for each regular expression of list_of_birds
            for expression in list_of_birds:
                #The variable res is the string of the title
                res = re.search(str(expression), row.title)
                #If there is a match
                if res != None:
                    #And if there isn't already an entry
                    if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() == None:
                        #The entry is the ad_id and the status is 1
                        entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                        entry.insertParse_bird(session)
                        session.commit()
                        pass

            #Step 2 : search in the description for each regular expression of list_of_birds
            for expression in list_of_birds:
                #If a description exists for this ad
                if row.description != None:
                    try:
                        #The variable res is the string of the description
                        res = re.search(str(expression), row.description)
                    except:
                        #Otherwise report an unknown error and treat it as no match
                        print('unknown error')
                        print(row.ad_id)
                        res = None
                #If there is a match