def entry_ad_clean(row, id_vendor, price, currency, price_in_dollar):
    """function to make an entry into the ads_clean table"""
    entry = Ads_clean(ad_id=row.ad_id, ad_number=row.ad_number, id_vendor=id_vendor,
                      title=row.title, description=row.description, breed=row.breed,
                      age=row.age, sex=row.sex, primary_color=row.primary_color,
                      secondary_color=row.secondary_color, price=price, currency=currency,
                      price_in_dollar=price_in_dollar, payment_forms=row.payment_forms)
    entry.insertAds_clean(session)
    session.commit()
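#Illustrative sketch (assumption, not part of the original pipeline): entry_ad_clean is
#expected to be called once per raw ad row, after the price has been normalised. The
#`Ads` table, its `price`/`currency`/`id_vendor` attributes and the rate dictionary below
#are hypothetical placeholders for the project's real raw-ads table and conversion step.
def example_clean_all_ads(rate_to_dollar):
    """Hypothetical driver loop showing one way entry_ad_clean could be used."""
    for row in session.query(Ads).filter_by(status=0):  #hypothetical raw-ads table
        price = float(row.price) if row.price is not None else None
        price_in_dollar = price * rate_to_dollar.get(row.currency, 1.0) if price is not None else None
        entry_ad_clean(row, row.id_vendor, price, row.currency, price_in_dollar)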
def make_helper_tables():
    """Creates the Regex table and the Match_Regex_IdMap table, i.e. the helper tables
    used later to reconstruct the matches; they do not contain the scientific names."""
    for row in session.query(Mapping):  #for each entry in the table mapping_cites
        #process the common names
        entree = row.common_name.split('; ')  #create a list with the common names as entries
        for name in entree:  #for each common name
            name = name.lower()  #lowercase the name (str.lower returns a new string)
            list_of_words = name.split(' ')  #list of the words composing the common name
            for i in list_of_words:
                if len(i) <= 2:  #leave small words alone
                    continue
                #bigger words - turn them into a regular expression
                if isinstance(i, str):  #check it is a string; if not, do nothing with it
                    res = word_to_regex(i)
                else:
                    print('problem with input')
                    res = None
                word = i.strip(';')
                if len(i) < 5:  #short words must meet additional requirements before they are added
                    if res is None or (r'\s' in res) or (r'\w' in res) or (res == ''):
                        print('exception, pass short error', res)
                        res = None
                    elif session.query(Regex).filter(Regex.reg == res).scalar() is None:
                        #there isn't already an entry and the regex isn't problematic
                        entry = Regex(reg=res, word=word.lower())  #create entry
                        entry.insertregex(session)  #insert entry
                        session.commit()  #commit entry
                #for long words (len >= 5)
                elif session.query(Regex).filter(Regex.reg == res).scalar() is None and res is not None:
                    #entry doesn't exist: create an entry in the regex table
                    entry = Regex(reg=res, word=word.lower())
                    entry.insertregex(session)
                    session.commit()
                #otherwise the entry already exists: nothing to do
                #next step: fill up Match_Regex_IdMap
                requested_re = session.query(Regex.id).filter(Regex.reg == res).scalar()
                request = session.query(Match_Regex_IdMap.id).filter_by(id_re=requested_re, id_map=row.id).scalar()
                if request is None and requested_re is not None:
                    #there isn't an entry yet (request) and the regular expression exists in Regex (requested_re)
                    entry = Match_Regex_IdMap(id_re=requested_re, id_map=row.id)  #create entry
                    entry.insertMatch(session)  #insert
                    session.commit()  #commit
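#Illustrative sketch (assumption): how the two helper tables filled above can be used
#later to map a regex hit in free text back to the Mapping rows it was generated from
#(and from there to the scientific names kept in mapping_cites). `text` is any ad title
#or description; assumes the `re` module is imported, as in the parsing code below.
def example_reconstruct_matches(text):
    """Return the set of Mapping ids whose common-name regexes match `text`."""
    matched_map_ids = set()
    for reg_row in session.query(Regex):  #every regex built from the common names
        if re.search(reg_row.reg, text.lower()):
            #follow the id map back to the originating Mapping rows
            for link in session.query(Match_Regex_IdMap).filter_by(id_re=reg_row.id):
                matched_map_ids.add(link.id_map)
    return matched_map_ids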
def entry_vendor(row, email, phone, website, status_parrot):
    """makes an entry into the table vendor_analyse, returns nothing"""
    print('make vendor entry')  #status in the command line
    entry = Vendor_analyse(pseudo=row.pseudo, contact_information=row.contact_information,
                           name=row.name, company=row.company, zip=row.zip, city=row.city,
                           state=row.state, country=row.country, county=row.county,
                           region=row.region, province=row.province, email=row.email,
                           email_description=email, phone=row.phone, phone_description=phone,
                           redirect_website=row.redirect_website, website_deviate=website,
                           status_parrot=status_parrot)  #all columns except id
    entry.insertVendor_analyse(session)
    session.commit()
path_result = './results/getCodes/codes/'

#Create the output directory
os.makedirs(os.path.dirname("./results/parseCodes/documentation/"), exist_ok=True)

#Iterate through client codes that have not been processed yet (status=0)
for row in session.query(Ads_Codes).filter_by(status=0):
    #Skip if the ad has already been parsed
    if session.query(exists().where(Parse_ads.ad_id == row.ad_id)).scalar():
        pass
    else:
        #Copy the global variable containing the fields in the ad
        dic_champs = dict_champ.copy()
        filename = row.client_code
        #Store the ad number to use later
        dic_champs["Ad Number"] = row.ad_number
        #Obtain the HTML object
        objet = lxml.html.parse(f"{path_result}{filename}").getroot()
        #The main function that parses the HTML object
        dic_champs = get_champs(dic_champs, objet, doc)
        #Insert the parsed entry, mark the code as processed, then commit
        entry = create_entry(dic_champs, row)
        entry.insertParse_ads(session)
        row.update(session)
        session.commit()
        #Write the documentation on every iteration so it is not lost if the script fails
        with open(f'./results/parseCodes/documentation/{date_parsing}_documentation.json', 'wb') as f:
            f.write(str(doc).encode('utf-8'))
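#Illustrative sketch (assumption): list_of_birds, used by the classification step below,
#is presumably a list of regular-expression strings for bird names. One possible way to
#assemble such a list is from the Regex helper table built in make_helper_tables(); this
#is only an illustration, not the project's confirmed source for list_of_birds.
def example_build_list_of_birds():
    """Hypothetical builder: collect the regex strings searched against title and description."""
    return [reg_row.reg for reg_row in session.query(Regex)]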
#If the ad (ad_id) is not yet classified (0 or 1)
if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() is None:
    #Step 1: search the title for each regular expression of list_of_birds
    for expression in list_of_birds:
        #res is the match object from searching the title (None if no match)
        res = re.search(str(expression), row.title)
        #If there is a match
        if res is not None:
            #And if there isn't already an entry
            if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() is None:
                #The entry is the ad_id and the status is 1
                entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                entry.insertParse_bird(session)
                session.commit()
    #Step 2: search the description for each regular expression of list_of_birds
    for expression in list_of_birds:
        #If a description exists for this ad
        if row.description is not None:
            try:
                #res is the match object from searching the description (None if no match)
                res = re.search(str(expression), row.description)
            except Exception:
                #Otherwise report an unknown error and move on
                print('unknown error')
                print(row.ad_id)
                res = None
            #If there is a match