def NextpageDuespaghi(url, page):
    """Return the URL and parsed content of the Duespaghi page following ``url``.

    The current page number is taken from the ``pag`` query parameter. When the
    parameter is missing, ``url`` is treated as the first page and page 2 is
    probed instead.

    Args:
        url: URL of the listing page currently being processed.
        page: parsed content of that page (not inspected here).

    Returns:
        ``(next_url, next_page)`` when a usable next page was read, otherwise
        ``(False, '')``. Errors are logged and also yield ``(False, '')``.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        o = urlparse(url)
        base = "http://" + o.hostname + o.path
        found = re.search(r'pag=(\d+)', o.query)

        if found is None:
            # No page number in the query string: this is the first page.
            # Probe page 2 and accept it only if it lists at least one asset.
            url_a = base + "?pag=2&ord=relevance&dir=desc"
            rc, newpage = ReadPage(url_a)
            if rc == 0 and newpage is not None:
                if newpage.xpath('//*[@class="row-identity-container"]/a/@href'):
                    return url_a, newpage
            return False, ''

        # Build the pagination URL for the following page and check it exists.
        nx = int(found.group(1)) + 1
        url_a = base + "?pag=" + str(nx) + "&ord=relevance&dir=desc"
        rc, newpage = ReadPage(url_a)
        if rc != 0 or newpage is None:
            return False, ''
        # Pages may exist without content: the page is rejected when a
        # "fa fa-chevron-right" icon class appears under a "disabled" element.
        test = newpage.xpath('//*[@class="disabled"]//i//@class')
        if "fa fa-chevron-right" not in test:
            return url_a, newpage
    except Exception as err:
        # Any failure ends pagination; it must not fall back to page 2.
        gL.log(gL.ERROR, url)
        gL.log(gL.ERROR, err)
    return False, ''
def dbAssetPrice(Asset, PriceList, currency):
    """Delete and rewrite the price row of an asset.

    Args:
        Asset: asset identifier used as key in ``AssetPrice``.
        PriceList: sequence of ``(label, value)`` entries; only the labels
            ``PriceCurr``, ``PriceFrom``, ``PriceTo`` and ``PriceAvg`` are used.
        currency: currency used when the list carries no ``PriceCurr`` value.

    Returns:
        True when no exception occurred (even if nothing was written), False
        when an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        if len(PriceList) > 0:
            price = {'PriceCurr': "", 'PriceFrom': 0, 'PriceTo': 0, 'PriceAvg': 0}
            for entry in PriceList:
                label = entry[0]
                if label in price:
                    price[label] = entry[1]
            if price['PriceCurr'] == '':
                price['PriceCurr'] = currency
            all_zero = (price['PriceFrom'] == 0
                        and price['PriceTo'] == 0
                        and price['PriceAvg'] == 0)
            # Nothing is stored when every amount is still zero.
            if not all_zero:
                gL.cMySql.execute("Delete from AssetPrice where Asset = %s ", [Asset])
                gL.cMySql.execute(
                    "Insert into AssetPrice(Asset, PriceCurrency, PriceFrom, PriceTo, PriceAvg) Values (%s, %s, %s, %s, %s)",
                    (Asset, price['PriceCurr'], price['PriceFrom'], price['PriceTo'], price['PriceAvg']))
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def OpenConnectionSqlite():
    """Return the shared in-memory SQLite connection and cursor.

    The connection and its cursor are created on first use and stored on
    ``gL.SqLite`` / ``gL.cLite``; later calls return the stored objects.

    Returns:
        The tuple ``(gL.SqLite, gL.cLite)``.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    if gL.SqLite:
        return gL.SqLite, gL.cLite
    gL.SqLite = sqlite3.connect(':memory:')
    gL.cLite = gL.SqLite.cursor()
    return gL.SqLite, gL.cLite
def QueueViamichelin(country, assettype, source, starturl, pageurl, page):
    """Enqueue every asset linked from a ViaMichelin listing page.

    Asset names and links are extracted from ``page`` with XPath, paired in
    document order and passed to ``gL.dbEnqueue``.

    Args:
        country, assettype, source, starturl, pageurl: passed through to
            ``gL.dbEnqueue`` to identify the queue entry.
        page: parsed listing page exposing ``xpath``.

    Returns:
        True when the page was processed, False when parsing produced
        inconsistent or empty results or an exception was logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        href = page.xpath('//a[@class="parseHref jsNodePoiLink"]//@href')
        titles = page.xpath('//h2[@class="parseInnerText jsNodePoiTitle"]//text()')
        # Drop names that contain only spaces.
        nomi = [item for item in titles if item.replace(" ", "") != '']

        if len(nomi) > len(href):
            msg = "%s - %s" % ("Errore nel parsing dei nomi o di href", pageurl)
            gL.log(gL.ERROR, msg)
            return False
        if not nomi or not href:
            msg = "%s - %s" % ("Parsing dei nomi / href senza risultati", pageurl)
            gL.log(gL.ERROR, msg)
            return False

        # zip keeps name/link pairs aligned even when a link is skipped.
        for nome, link in zip(nomi, href):
            if not link:
                continue
            name = gL.StdName(nome)
            url = gL.SourceBaseUrl + link
            gL.dbEnqueue(country, assettype, source, starturl, pageurl, url, name)
    except Exception as err:
        gL.log(gL.ERROR, pageurl)
        gL.log(gL.ERROR, err)
        return False
    return True
def UpdDriveRun(startend):
    """Stamp the active ``Drive`` rows with the run start or end date.

    Args:
        startend: ``"START"`` writes ``gL.RunDate`` into ``RunDate``;
            ``"END"`` writes ``gL.SetNow()`` into ``RunDate_end``. Any other
            value performs no update.

    Returns:
        True when no exception occurred, False when an exception was raised
        and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        if startend == "START":
            gL.cMySql.execute("Update Drive set RunDate = %s where active = True", ([gL.RunDate]))
        if startend == "END":
            gL.cMySql.execute("Update Drive set RunDate_end = %s where active = True", ([gL.SetNow()]))
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
    # Explicit success value, matching the other status helpers.
    return True
def ParseAsset(country, assettype, source, starturl, pageurl, asseturl, name):
    """Parse a single asset page and track its progress in the queue.

    The queue entry is marked ``START`` before parsing and ``END`` only when
    ``gL.ParseContent`` returns a truthy asset.

    Returns:
        True when the asset was parsed, False otherwise.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    # Record in the queue that work on this asset has begun.
    gL.dbQueueStatus("START", country, assettype, source, starturl, pageurl, asseturl)
    parsed = gL.ParseContent(country, assettype, source, starturl, asseturl, name)
    if not parsed:
        return False
    # Record in the queue that the asset is done.
    gL.dbQueueStatus("END", country, assettype, source, starturl, pageurl, asseturl)
    return True
def RunIdCreate(RunType):
    """Insert a new row into ``Run`` and return its generated id.

    Args:
        RunType: value stored in the ``RunType`` column.

    Returns:
        The ``lastrowid`` reported by ``gL.cMySql`` on success, or False when
        the insert failed or no id was generated (the error is logged).
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        gL.cMySql.execute("Insert into Run (Start, RunType) Values (%s, %s)", (gL.SetNow(), RunType))
        run = gL.cMySql.lastrowid   # generated auto-number id
        if run is None:
            raise Exception("Get autonum generato con errore")
        return run
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def ParseNextPage(source, assettype, country, pageurl, page):
    """Dispatch to the source-specific "next page" function.

    The function name is looked up with ``gL.GetFunzione("NEXT", ...)`` and
    resolved in this module's globals.

    Args:
        source, assettype, country: key used to select the function.
        pageurl: URL of the current listing page.
        page: parsed content of the current listing page.

    Returns:
        Whatever the selected function returns (the visible implementations
        return ``(url, page)`` or ``(False, '')``). On error the failure is
        logged and ``(False, '')`` is returned, so callers can always unpack
        the result.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        fn = gL.GetFunzione("NEXT", source, assettype, country)
        if not fn:
            raise Exception("Funzione NEXT non trovata")
        return globals()[fn](pageurl, page)
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False, ''
def BuildQueue(country, assettype, source, starturl, pageurl, page):
    """Dispatch to the source-specific queue-building function.

    The function name is looked up with ``gL.GetFunzione("QUEUE", ...)`` and
    resolved in this module's globals.

    Returns:
        The selected function's return value, or False when the function is
        missing or an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        fn = gL.GetFunzione("QUEUE", source, assettype, country)
        if fn:
            handler = globals()[fn]
            return handler(country, assettype, source, starturl, pageurl, page)
        raise Exception("Funzione QUEUE non trovata")
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def DumpGoogleResults(Asset, name, indirizzo, chk):
    """Replace the debug rows of Google results stored for an asset.

    Args:
        Asset: asset identifier.
        name: asset name written to ``AssetName``.
        indirizzo: asset address written to ``AssetAddress``.
        chk: sequence of result rows; items 0, 2, 3, 4 and 5 of each row are
            stored.

    Returns:
        None when ``chk`` is empty, otherwise True.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    if len(chk) == 0:
        return
    # Existing rows are removed once, then one row per result is inserted.
    gL.cMySql.execute("Delete from Debug_GoogleResults where Asset = %s", [Asset])
    insert_sql = ("Insert into Debug_GoogleResults(Asset, AssetName, AssetAddress, GblRatio, Nome, Address, NameRatio, StreetRatio) "
                  "Values (%s, %s, %s, %s, %s, %s, %s, %s)")
    for result in chk:
        values = (Asset, name, indirizzo, result[0], result[2], result[3], result[4], result[5])
        gL.cMySql.execute(insert_sql, values)
    return True
def LoadProxyList():
    """Append the first column of every active ``RunProxies`` row to ``gL.Proxies``.

    Returns:
        True when at least one row was loaded, False when no row was found or
        an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        gL.cMySql.execute("Select * from RunProxies where Active = %s", [gL.YES])
        rows = gL.cMySql.fetchall()
        if len(rows) < 1:
            return False
        for row in rows:
            gL.Proxies.append(row[0])
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def RunIdStatus(startend):
    """Write the start or end timestamp of the current run (``gL.RunId``).

    Args:
        startend: ``"START"`` updates the ``Start`` column, ``"END"`` updates
            the ``End`` column; any other value performs no update.

    Returns:
        True when no exception occurred, False when an exception was raised
        and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        if startend == "START":
            gL.cMySql.execute("Update Run set Start = %s where RunId = %s ", (gL.SetNow(), gL.RunId))
        if startend == "END":
            gL.cMySql.execute("Update Run set End = %s where RunId = %s ", (gL.SetNow(), gL.RunId))
        return True
    except Exception as err:
        # Report the failure instead of swallowing it silently.
        gL.log(gL.ERROR, err)
        return False
def dbQueueStatus(startend, country, assettype, source, starturl, pageurl, asseturl):
    """Mark a queue entry as started or finished for the current run.

    Args:
        startend: ``"START"`` sets ``Start`` to now and ``End`` to 0;
            ``"END"`` sets ``End`` to now. Both also store ``gL.RunId``.
        country, assettype, source, starturl, pageurl, asseturl: key of the
            queue row to update.

    Returns:
        True when no exception occurred, False when an exception was raised
        and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    key = (country, assettype, source, starturl, pageurl, asseturl)
    try:
        if startend == "START":
            gL.cMySql.execute(
                "Update queue set Start=%s, End=0, RunId=%s where Country=%s and AssetType=%s and Source=%s and Starturl=%s and Pageurl=%s and AssetUrl=%s",
                (gL.SetNow(), gL.RunId) + key)
        if startend == "END":
            gL.cMySql.execute(
                "Update queue set End=%s, RunId=%s where Country=%s and AssetType=%s and Source=%s and Starturl=%s and Pageurl=%s and AssetUrl=%s",
                (gL.SetNow(), gL.RunId) + key)
    except Exception as err:
        # str() on every part so the handler cannot fail on a non-str value.
        context = "".join(str(part) for part in (source, assettype, country, starturl, pageurl, asseturl))
        gL.log(gL.ERROR, context, err)
        return False
    return True
def dbAssetOpening(Asset, orario):
    """Delete and rewrite the opening hours of an asset.

    Args:
        Asset: asset identifier.
        orario: sequence of entries where item 0 is the week day and items 1
            and 2 are the opening and closing times; a ``":"`` is inserted
            after the first two characters of each time.

    Returns:
        True on success, False when an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)

    def with_colon(raw):
        # Split after the first two characters, e.g. "0930" -> "09:30".
        return raw[:2] + ":" + raw[2:]

    try:
        gL.cMySql.execute("Delete from AssetOpening where Asset = %s ", [Asset])
        for slot in orario:
            open_from = with_colon(slot[1])
            open_to = with_colon(slot[2])
            gL.cMySql.execute(
                "Insert into AssetOpening(Asset, WeekDay, OpenFrom, OpenTo) Values (%s, %s, %s, %s)",
                (Asset, slot[0], open_from, open_to))
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def ParseGooglePlacesMain(Asset, AAsset):
    """Load an asset's address row and run the Google Places parser on it.

    The row is read from ``QAddress`` and its fields are forwarded to
    ``gL.ParseGooglePlaces``.

    Args:
        Asset: asset identifier to look up.
        AAsset: value forwarded unchanged as the last argument of
            ``gL.ParseGooglePlaces``.

    Returns:
        The result of ``gL.ParseGooglePlaces``, or False when the row is
        missing or an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        gL.cMySql.execute("Select * from QAddress where Asset = %s", [Asset])
        row = gL.cMySql.fetchone()
        if not row:
            gL.log(gL.ERROR, "asset:" + str(Asset))
            return False

        # Read every column up front, in the original order.
        columns = ('Country', 'AssetType', 'Source', 'StartUrl', 'AssetUrl', 'name',
                   'address', 'addrstreet', 'addrcity', 'addrzip', 'addrcounty')
        data = {}
        for column in columns:
            data[column] = row[column]

        return gL.ParseGooglePlaces(
            Asset,
            data['AssetType'],
            data['name'],
            gL.xstr(data['addrstreet']),
            gL.xstr(data['addrzip']),
            gL.xstr(data['addrcity']),
            gL.xstr(data['Country']),
            gL.xstr(data['address']),
            AAsset)
    except Exception as err:
        gL.log(gL.ERROR, "asset:" + str(Asset))
        gL.log(gL.ERROR, err)
        return False
def GetFunzione(tipo, source, assettype, country):
    """Look up the name of the handler function configured for a source.

    ``gL.Funzioni`` is scanned for the first entry whose ``source``,
    ``assettype`` and ``country`` match.

    Args:
        tipo: ``"PARSE"``, ``"QUEUE"`` or ``"NEXT"``; selects the ``ParseFn``,
            ``QueueFn`` or ``NextPageFn`` field respectively.
        source, assettype, country: key of the configuration entry.

    Returns:
        The configured value of the selected field, or False when ``tipo`` is
        unknown, no entry matches, or an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    fields = {"PARSE": 'ParseFn', "QUEUE": 'QueueFn', "NEXT": 'NextPageFn'}
    try:
        field = fields.get(tipo)
        if field is None:
            return False
        for k in gL.Funzioni:
            if k['source'] == source and k['assettype'] == assettype and k['country'] == country:
                return k[field]
    except Exception as err:
        gL.log(gL.ERROR, err)
    # Explicit "not found" result instead of an implicit fall-through.
    return False
def StdPhone(stringa, country):
    """Format up to two phone numbers found in ``stringa`` in international form.

    When ``stringa`` contains ``' - '`` only the part before the first
    separator is scanned. The ISO country code is taken from the
    ``gL.CountryISO`` cache, falling back to a ``T_Country`` lookup whose
    result is cached.

    Args:
        stringa: raw text containing the phone number(s).
        country: country used to resolve the ISO code.

    Returns:
        ``(first, second)`` with the formatted numbers, parentheses removed
        (``''`` for a missing one), or ``(False, False)`` on any failure,
        including an unknown country.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        parts = stringa.split(' - ')   # two numbers separated by a dash
        if len(parts) > 1:
            stringa = parts[0]
        ISO = gL.CountryISO.get(country)
        if ISO is None:
            gL.cMySql.execute("select CountryIso2 from T_Country where Country = %s", ([country]))
            row = gL.cMySql.fetchone()
            if row:
                ISO = row['CountryIso2']
                gL.CountryISO[country] = ISO
        if ISO is None:
            gL.log(gL.ERROR, "Lingua non trovata")
            # Same shape as every other failure path so callers can unpack.
            return False, False
    except Exception as err:
        gL.log(gL.ERROR, stringa)
        gL.log(gL.ERROR, err)
        return False, False

    # Format the first two matches.
    try:
        newphone = ''
        newphone1 = ''
        idx = 0
        numeri = phonenumbers.PhoneNumberMatcher(stringa, ISO)
        while numeri.has_next():
            idx = idx + 1
            match = numeri.next()
            if idx > 2:
                continue
            text = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
            text = text.replace('(', '').replace(')', '')
            if idx == 1:
                newphone = text
            else:
                newphone1 = text
    except Exception:
        msg = "%s - %s" % ("Phone stdz error", stringa)
        gL.log(gL.ERROR, msg)
        return False, False
    return (newphone, newphone1)
def StdAddress(AddrStreet, AddrZIP, AddrCity, AddrCountry, indirizzo=''):
    """Standardize an address through ``Geocoder.geocode``.

    Increments ``gL.GmapNumcalls`` on every call. When ``indirizzo`` is empty
    the query is built from street, ZIP, city and country.

    Args:
        AddrStreet, AddrZIP, AddrCity, AddrCountry: address components.
        indirizzo: optional full address string used as the query instead.

    Returns:
        A 9-tuple ``(ok, street, city, zip, lat, long, region, county,
        formatted_address)``. On failure ``ok`` is False, street/city/zip are
        the input values and the remaining fields are ``0, 0, '', '', ''``.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    gL.GmapNumcalls = gL.GmapNumcalls + 1
    AddrRegion = ''
    AddrLat = 0
    AddrCounty = ''
    AddrLong = 0
    FormattedAddress = ''
    if indirizzo == '':
        indirizzo = xstr(AddrStreet) + " " + xstr(AddrZIP) + " " + xstr(AddrCity) + " " + xstr(AddrCountry)
    try:
        results = Geocoder.geocode(indirizzo)
        if results is None:
            msg = "Indirizzo: " + indirizzo + " non trovato"
            gL.log(gL.WARNING, msg)
            return (False, AddrStreet, AddrCity, AddrZIP, 0, 0, '', '', '')
        if not results.count > 0:
            return (False, AddrStreet, AddrCity, AddrZIP, 0, 0, '', '', '')

        result = results[0]   # only the first returned value is used
        # County: short name of the first component whose leading type is
        # administrative_area_level_2.
        for component in result.current_data['address_components']:
            types = component['types']
            if types and types[0] == "administrative_area_level_2":
                AddrCounty = component['short_name']
                break
        if result.route and result.street_number:
            AddrStreet = result.route + " " + result.street_number
        AddrCity = result.locality
        AddrZIP = result.postal_code
        if result.coordinates[0]:
            AddrLat = result.coordinates[0]
        if result.coordinates[1]:
            AddrLong = result.coordinates[1]
        if result.administrative_area_level_1:
            AddrRegion = result.administrative_area_level_1
        if result.formatted_address:
            FormattedAddress = result.formatted_address
        return True, AddrStreet, AddrCity, AddrZIP, AddrLat, AddrLong, AddrRegion, AddrCounty, FormattedAddress
    except GeocoderError as err:
        # Any geocoder error is a plain failure for the caller.
        gL.log(gL.WARNING, err)
        return (False, AddrStreet, AddrCity, AddrZIP, 0, 0, '', '', '')
def dbAssetReview(Asset, r):
    """Delete and rewrite the review summary rows of an asset.

    Args:
        Asset: asset identifier.
        r: sequence of entries where item 0 is stored as ``EvalNum`` and item
            1 as ``EvalPoint`` (both converted with ``int``).

    Returns:
        True on success or when ``r`` is empty (nothing is touched), False
        when an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        if not len(r):
            return True
        gL.cMySql.execute("Delete from AssetReview where Asset = %s", [Asset])
        insert_sql = "Insert into AssetReview(Asset, EvalPoint, EvalNum) Values (%s,%s,%s)"
        for entry in r:
            count = int(entry[0])
            score = int(entry[1])
            gL.cMySql.execute(insert_sql, (Asset, score, count))
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def dbLastReviewDate(Asset, LastReviewDate):
    """Move an asset's ``LastReviewDate`` forward when a newer date is given.

    The stored date is replaced when it is NULL or earlier than
    ``LastReviewDate``.

    Returns:
        True on success, False when the asset is missing or an exception was
        raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        gL.cMySql.execute("select LastReviewDate from Asset where Asset=%s", [Asset])
        row = gL.cMySql.fetchone()
        if row is None:
            raise Exception("Errore: Asset non trovato")
        stored = row['LastReviewDate']
        outdated = stored is None or (stored < LastReviewDate)
        if outdated:
            gL.cMySql.execute("Update Asset set LastReviewDate=%s where Asset=%s", (LastReviewDate, Asset))
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def Restart(RunType):
    """Decide whether the latest run of ``RunType`` must be restarted.

    The first row returned for ``Run`` ordered by ``RunId`` descending is
    inspected. A run whose ``End`` is NULL or earlier than its ``Start`` is
    considered unfinished; in that case ``gL.restart`` is set to True.

    Args:
        RunType: run type used to filter the ``Run`` table.

    Returns:
        The ``RunId`` to restart, 0 when no restart is needed, or False when
        an exception was raised and logged.
    """
    try:
        if gL.trace:
            gL.log(gL.DEBUG)
        gL.restart = False
        gL.cMySql.execute("SELECT RunId, Start, End FROM Run where RunType = %s GROUP BY RunId, Start, End ORDER BY RunId DESC", ([RunType]))
        check = gL.cMySql.fetchone()
        if check:   # a Run record exists
            runid = check['RunId']
            end = check['End']
            start = check['Start']
            if end is None or end < start:
                gL.restart = True
                return runid
        return 0
    except Exception as err:
        # Report the failure instead of swallowing it silently.
        gL.log(gL.ERROR, err)
        return False
def ParseContent(country, assettype, source, starturl, asseturl, name):
    """Create or update the asset row, then run its source-specific parser.

    The parser name is looked up with ``gL.GetFunzione("PARSE", ...)`` and
    resolved in this module's globals.

    Returns:
        The asset id when the parser returned a truthy value, otherwise False
        (the reason is logged).
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        Asset = gL.dbAsset(country, assettype, source, name, asseturl)   # insert the asset
        if Asset == 0:
            raise Exception("Errore nella creazione dell'asset")
        fn = gL.GetFunzione("PARSE", source, assettype, country)
        if not fn:
            raise Exception("Funzione PARSE non trovata")
        parser = globals()[fn]
        if not parser(country, asseturl, name, Asset):
            raise Exception("Funzione PARSE con errori")
        return Asset
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def sql_RestartUrl(country, assettype, source, rundate, starturl="", pageurl=""):
    """Return the start/page URL pair from which a restarted run resumes.

    The first row returned by the pagination query on ``Queue`` (rows with an
    empty ``AssetUrl``, ordered by ``InsertDate`` descending) is used.

    Returns:
        ``(starturl, pageurl)`` from that row, or False when no row exists or
        an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        query = ("SELECT StartUrl, PageUrl, max(InsertDate) FROM Queue where "
                 "country = %s and assetTypeId = %s and Source = %s and RunDate = %s and StartUrl is NOT NULL and PageUrl IS NOT NULL and AssetUrl='' "
                 "group by starturl, pageurl order by InsertDate desc")
        gL.cMySql.execute(query, (country, assettype, source, rundate))
        row = gL.cMySql.fetchone()
        if row is None:
            return False
        return row['StartUrl'], row['PageUrl']
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def SaveContent(url, content):
    """Store the content fetched for ``url``, updating it only when changed.

    Args:
        url: key of the ``AssetContent`` row.
        content: content to store.

    Returns:
        True on success, False when the insert/update raised an exception
        (which is logged). Errors raised by the initial lookup propagate to
        the caller.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    # Parameterized lookup: the URL must never be spliced into the SQL text.
    gL.cMySql.execute("Select * from AssetContent where Url = %s", ([url]))
    check = gL.cMySql.fetchone()
    try:
        if check is not None:
            CurContent = check['Content']
            if CurContent != content:
                gL.cMySql.execute("Update AssetContent set Content=%s, RunId=%s where url=%s", (content, gL.RunId, url))
        else:
            gL.cMySql.execute("Insert into AssetContent(Url, Content, RunId) Values (%s, %s, %s)",
                              (url, content, gL.RunId))
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
    return True
def dbAssetTag(Asset, tag, tagname):
    """Add the missing tags of one category to an asset.

    Duplicates are removed, each tag is normalized with ``gL.StdCar``, results
    shorter than two characters are skipped, and a row is inserted only when
    the same (asset, tag name, tag) is not already stored.

    Returns:
        True on success, False when an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        if len(tag) > 0:
            unique_tags = list(set(tag))   # drop duplicates
            for raw in unique_tags:
                value = gL.StdCar(raw)
                if len(value) < 2:
                    continue
                gL.cMySql.execute("Select * from AssetTag where Asset=%s and TagName=%s and Tag=%s", (Asset, tagname, value))
                existing = gL.cMySql.fetchone()
                if existing is not None:
                    continue
                gL.cMySql.execute("Insert into AssetTag(Asset, TagName, Tag) Values (%s, %s, %s)", (Asset, tagname, value))
        return True
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
def NextpageTripadvisor(url, page):
    """Return the URL and content of the next TripAdvisor listing page.

    Every ``<link rel="next">`` href of ``page`` is tried in order, prefixed
    with ``gL.assetbaseurl``; the first one that reads successfully wins.

    Returns:
        ``(next_url, next_page)``, or ``(False, '')`` when there is no
        readable next page or an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        for href in page.xpath('//link[@rel="next"]/@href'):
            # url is rebound so the error log reports the last attempted URL.
            url = gL.assetbaseurl + href
            rc, newpage = ReadPage(url)
            if rc != 0 or newpage is None:
                continue
            return url, newpage
    except Exception as err:
        gL.log(gL.ERROR, url, err)
        return False, ''
    return False, ''
def NextpageQristoranti(url, page):
    """Return the URL and content of the next Qristoranti listing page.

    The current page number is the text of the first ``span.inactive``; the
    first ``a.paginate`` link is followed only when its number is greater.

    Returns:
        ``(next_url, next_page)``, or ``(False, '')`` when there is no
        pagination, no further page, the page could not be read, or an
        exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    no_next = (False, '')
    try:
        current = page.xpath('//span[@class="inactive"]/text()')
        # No current-page marker means the listing is not paginated.
        if len(current) == 0:
            return no_next
        curpa = int(current[0])
        links = page.xpath('//a[@class="paginate"]/@href')
        numpa = page.xpath('//a[@class="paginate"]/text()')
        if numpa[0] is not None:
            if not int(numpa[0]) > int(curpa):
                return no_next
            rc, newpage = ReadPage(links[0])
            if rc == 0 and newpage is not None:
                return (links[0], newpage)
            return no_next
    except Exception as err:
        gL.log(gL.ERROR, url)
        gL.log(gL.ERROR, err)
        return no_next
    return no_next
def ParseArgs():
    """Parse the command line and copy the options onto ``gL``.

    ``-test`` selects the test DSN (``gL.Tst_MsAccDsn``) instead of the
    production one; ``-url``, ``-debug``, ``-trace`` and ``-resetnames`` set
    the matching ``gL`` attribute only when given. The parsed namespace is
    stored in ``gL.Args``.

    Returns:
        Always True.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-test', action='store_true', default=False, dest='test',
        help='Decide se il run e di test, e cambia il DNS del database in DsnTest')
    parser.add_argument(
        '-url', action='store', default='', dest='testurl',
        help="Esamina solo l'url")
    parser.add_argument(
        '-debug', action='store_true', default='', dest='debug',
        help="Dump tabelle interne su Db")
    parser.add_argument(
        '-trace', action='store_true', default='', dest='trace',
        help="Traccia sul log tutte le chiamate alle funzioni")
    parser.add_argument(
        '-resetnames', action='store_true', default='', dest='resetnames',
        help="Inizializza tutti i nomi standard prima di una nuova standardizzazione dei nomi. Esclusi i nomi modificati a mano")
    args = parser.parse_args()

    if args.test:
        gL.testrun = True
        gL.Dsn = gL.Tst_MsAccDsn
        print("RUN DI TEST!!!!")
    else:
        gL.testrun = False
        gL.Dsn = gL.Prd_MsAccDsn
        print("RUN EFFETTIVO")

    if args.testurl:
        gL.testurl = args.testurl
    # Boolean switches are only ever turned on, never reset.
    for flag in ('debug', 'trace', 'resetnames'):
        if getattr(args, flag):
            setattr(gL, flag, True)

    gL.Args = args
    return True
def dbAsset(country, assettype, source, name, url, AAsset=0, GooglePid=''):
    """Insert a new asset or refresh an existing one, returning its id.

    The asset is looked up by ``Url`` when ``GooglePid`` is empty, otherwise
    by ``GooglePid``. An existing row gets its name and update timestamp
    refreshed; a missing one is inserted.

    Returns:
        The asset id, or 0 when an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        msg = "%s %s(%s) - %s - %s" % ('Asset:', gL.N_Ass, gL.T_Ass, name.encode('utf-8'), url.encode('utf-8'))
        gL.log(gL.INFO, msg)

        if GooglePid == '':
            lookup_sql, lookup_key = "Select * from Asset where Url = %s", url
        else:
            lookup_sql, lookup_key = "Select * from Asset where GooglePid = %s", GooglePid
        gL.cMySql.execute(lookup_sql, [lookup_key])
        CurAsset = gL.cMySql.fetchone()

        if CurAsset is None:
            # Not stored yet: insert it and pick up the generated id.
            insert_sql = ("Insert into Asset(Source, AssetType, Country, Url, Name, Created, Updated, Active, GooglePid, AAsset) "
                          "Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            gL.cMySql.execute(
                insert_sql,
                (source, assettype, country, url, name, gL.RunDate, gL.SetNow(), gL.YES, GooglePid, AAsset))
            Asset = gL.cMySql.lastrowid
            if Asset is None:
                raise Exception("Get autonum errato")
            return Asset

        # Already present: refresh name and update timestamp.
        Asset = int(CurAsset['Asset'])
        gL.cMySql.execute("Update Asset set Name=%s, Updated=%s where Asset=%s", (name, gL.SetNow(), Asset))
        return Asset
    except Exception as err:
        gL.log(gL.ERROR, err)
        return 0
def BuildAssetList(country, assettype, source, starturl, pageurl, runlogid):
    """Walk the listing pages from ``pageurl`` and queue the assets they link.

    Pages are taken from the module-level ``work_queue``. Each page is read
    (unless its content was already fetched while looking for the next page),
    recorded with ``gL.PagesStatus``, handed to ``gL.BuildQueue``, and then
    ``gL.ParseNextPage`` supplies the following page, if any.

    Args:
        country, assettype, source, starturl: identify the listing.
        pageurl: first listing page to process.
        runlogid: not used by this function.

    Returns:
        True when pagination ended normally, False when the next-page lookup
        failed or an exception was raised and logged.
    """
    if gL.trace:
        gL.log(gL.DEBUG)
    try:
        work_queue.append((pageurl, ""))
        while len(work_queue):
            pageurl, newpage = work_queue.popleft()
            msg = "%s - %s" % ("PAGINATE", pageurl)
            gL.log(gL.INFO, msg)
            if newpage == '':
                rc, page = gL.ReadPage(pageurl)
            else:
                # Content already fetched by the next-page lookup.
                rc = 0
                page = newpage
            if rc != 0 or page is None:
                continue

            # Record the page in the run log as started.
            gL.PagesStatus("START", country, assettype, source, starturl, pageurl)
            # Read the asset links on this page and enqueue them.
            gL.BuildQueue(country, assettype, source, starturl, pageurl, page)
            # Record that the page has been fully examined.
            gL.PagesStatus("END", country, assettype, source, starturl, pageurl)

            # Find the next listing page.
            result = gL.ParseNextPage(source, assettype, country, pageurl, page)
            if not result:
                # A falsy (non-tuple) result signals a lookup failure; stop
                # here rather than fail on the unpacking below.
                return False
            newpageurl, newpage = result
            if newpageurl:
                work_queue.append((newpageurl, newpage))
                gL.PagesCreate(source, assettype, country, starturl, newpageurl)
    except Exception as err:
        gL.log(gL.ERROR, err)
        return False
    return True