def insert_contact(user_id, contact, cursor, connection):
    """
    Store one contact and associate it with a user.

    The contact row and the user/contact link are written through the same
    cursor and committed together. Any error is logged and swallowed, so the
    function always returns ``None``.

    :param user_id: id of the user the contact belongs to
    :param contact: dict shaped as
        { "contact": "......", "type": "EMAIL" / "NUMBER" / "ecc" }
    :param cursor: database cursor
    :param connection: database connection
    """
    try:
        contact_value = contact["contact"]
        contact_type_id = get_id_of_contact_type(contact["type"], cursor, connection)
        # Values are bound as parameters rather than formatted into the SQL
        # text: scraped contacts may contain quotes or hostile input.
        cursor.execute(
            "INSERT INTO contact (contact,idOfContactType) values (%s,%s)",
            (contact_value, contact_type_id),
        )
        contact_id = cursor.lastrowid
        cursor.execute(
            "INSERT INTO usercontacts (idContact,idUser) values (%s,%s)",
            (contact_id, user_id),
        )
        LOGGER.debug("contact inserted correctly, returning associated id {}".format(contact_id))
        connection.commit()
    except Exception as e:
        LOGGER.error("Error while inserting contact of user {} error {}".format(user_id, e))
def get_id_of_contact_type(contact_type, cursor, connection):
    """
    Return the id of a contact type, creating the type when it is missing.

    The type name is upper-cased for the lookup and for the insert, so the
    stored value always matches what later lookups search for.

    :param contact_type: contact type name (e.g. "EMAIL", "NUMBER")
    :param cursor: database cursor
    :param connection: database connection, committed when a type is created
    :return: id of the contact type if found; otherwise the id of the newly
        created contact type
    """
    normalized_type = contact_type.upper()
    cursor.execute(
        "SELECT idContactType FROM contacttype WHERE contactType = %s",
        (normalized_type,),
    )
    row = cursor.fetchone()
    if row is not None:
        LOGGER.debug("returning id of contact type: {}".format(contact_type))
        return row[0]

    # Not found: create it. Only the "no row" case reaches this point; real
    # database errors from the SELECT propagate to the caller instead of
    # being mistaken for a missing type.
    cursor.execute(
        "INSERT INTO contacttype (contactType) values (%s)",
        (normalized_type,),
    )
    contact_type_id = cursor.lastrowid
    connection.commit()
    LOGGER.debug("Contact type {} created correctly, returning associated id {}".format(contact_type, contact_type_id))
    return contact_type_id
def insert_location(location, cursor, connection):
    """
    Insert a location and return its id, reusing the existing row if any.

    :param location: location name as string; the literal "NULL" means
        "no location" and nothing is written
    :param cursor: database cursor
    :param connection: database connection
    :return: inserted location's id both if already present or not, or
        ``None`` when ``location`` is "NULL"
    """
    if location == "NULL":
        return None
    try:
        # Bound parameter: location names come from scraped pages.
        cursor.execute(
            "INSERT INTO location (locationName) values (%s)",
            (location,),
        )
        connection.commit()
        location_id = cursor.lastrowid
        LOGGER.debug("Location {} inserted correctly, returning associated id {}".format(location, location_id))
        return location_id
    except mysql.connector.IntegrityError:
        # The insert was rejected with an integrity error; treat that as
        # "already present" and look the existing id up.
        cursor.execute(
            "SELECT idLocation FROM location WHERE locationName = %s",
            (location,),
        )
        location_id = cursor.fetchone()[0]
        LOGGER.debug("Location {} already present, returning associated id {}".format(location, location_id))
        return location_id
def get_user_id_from_link(link, cursor):
    """
    Look up a user id by profile link.

    :param link: linkToProfile no prefix
    :param cursor: database cursor
    :return: user id, or ``None`` when no user has that link or the query
        fails (the failure is logged)
    """
    try:
        cursor.execute(
            "SELECT idUser FROM user WHERE linkToProfile = %s",
            (link,),
        )
        row = cursor.fetchone()
        if row is None:
            LOGGER.error("Error while getting id of user: {} error: {}".format(link, "no user with this link"))
            return None
        user_id = row[0]
        LOGGER.debug("User {} returning associated id {}".format(link, user_id))
        return user_id
    except Exception as e:
        LOGGER.error("Error while getting id of user: {} error: {}".format(link, e))
        return None
def insert_user(author, link, cursor, connection):
    """
    Insert a user and return its id, reusing the existing row if any.

    New users are stored as not yet visited, with unknown sex and city.

    :param author: username
    :param link: link to facebook profile without standard prefix
        https://www.facebook.com
    :param cursor: database cursor
    :param connection: database connection
    :return: inserted user's id both if already present or not
    """
    try:
        # Bound parameters: names and links come from scraped pages.
        cursor.execute(
            "INSERT INTO user (username, linkToProfile, alreadyVisited, sex, idCurrentCity) "
            "values (%s, %s, 0, NULL, NULL)",
            (author, link),
        )
        connection.commit()
        user_id = cursor.lastrowid
        LOGGER.debug("User {} inserted correctly, returning associated id {}".format(author, user_id))
        return user_id
    except mysql.connector.IntegrityError:
        # The insert was rejected with an integrity error; treat that as
        # "already present" and fetch the existing id by profile link.
        cursor.execute(
            "SELECT idUser FROM user WHERE linkToProfile = %s",
            (link,),
        )
        user_id = cursor.fetchone()[0]
        LOGGER.debug("User {} already present, returning associated id {} ".format(author, user_id))
        return user_id
def re_validate(self):
    """
    Periodically re-check the last node of a randomly chosen bucket.

    Runs forever. Each round sleeps for RE_VALIDATE_INTERVAL, removes the
    last node of a random non-empty bucket and pings it. A node that answers
    is put back at the front of its bucket; one that does not is dropped and,
    when the bucket's replacement cache is non-empty, a random cached node
    takes its place at the end of the bucket.
    """
    while True:
        time.sleep(RE_VALIDATE_INTERVAL)

        # Walk the buckets in random order and take the tail node of the
        # first one that has any nodes.
        visit_order = [index for index in range(len(self.buckets))]
        random.shuffle(visit_order)
        bucket_index = 0
        candidate = None
        for bucket_index in visit_order:
            nodes = self.buckets[bucket_index].nodes
            if len(nodes) > 0:
                candidate = nodes.pop()
                break

        if candidate is None:
            # every bucket is empty, nothing to check this round
            continue

        LOGGER.debug('{:5} revalidate {}'.format('', candidate))
        # wait for a pong
        alive = self.server.ping(candidate).get()
        bucket = self.buckets[bucket_index]
        if alive:
            # bump node
            bucket.nodes.insert(0, candidate)
            continue

        # pick a replacement
        cache = bucket.replace_cache
        if len(cache) > 0:
            replacement = cache.pop(random.randint(0, len(cache) - 1))
            if replacement:
                bucket.nodes.append(replacement)
def is_italian_location(location):
    """
    Guess whether a location is in Italy.

    Searches Google for "<location> wikipedia", follows the first Italian
    Wikipedia link found in the result page and checks whether that page
    contains the Italian-flag markup.

    :param location: location name as free text
    :return: True when the flag markup is found; False when it is not, or
        when the search or the page download fails
    """
    location = location.strip().replace(" ", "+") + "+wikipedia"
    LOGGER.debug("Testing location {}".format(location))
    try:
        query = requests.get(
            "https://www.google.com/search?q={}".format(location))
        # Dots are escaped so only the real it.wikipedia.org host matches.
        link = re.search(r'https://it\.wikipedia\.org/.+?(?=&)', query.text)
        if link is None:
            LOGGER.debug("Error while locating nation")
            return False
        wikipedia_page = requests.get(link.group())
    except Exception:
        # Without a page there is nothing to inspect. Returning False keeps
        # callers skipping this location, instead of failing further down on
        # an unbound ``wikipedia_page``.
        LOGGER.debug("Error while locating nation")
        return False
    LOGGER.debug("wikipedia page {}".format(wikipedia_page))
    is_italian = (
        '<span style="white-space:nowrap"><a href="/wiki/File:Flag_of_Italy.svg" class="image" title="Italia">'
        in wikipedia_page.text)
    LOGGER.debug("Location {} is italian? {}".format(location, is_italian))
    return is_italian
def check_citta(elem):
    """
    Read the city name from a profile section element.

    :param elem: element searched for the city link
    :return: text of the first matching link, or the string "NULL" when no
        link matches
    """
    city_links = elem.find_elements_by_xpath(".//div[@class='_6a _6b']/span/a")
    city = city_links[0].text if len(city_links) > 0 else "NULL"
    LOGGER.debug(city)
    return city
def insert_city(name, cursor, connection):
    """
    Insert a city and return its id, reusing the existing row if any.

    :param name: cityName
    :param cursor: database cursor
    :param connection: database connection
    :return: inserted city's id both if already present or not
    """
    try:
        # Bound parameter: city names come from scraped pages.
        cursor.execute(
            "INSERT INTO city (cityName) values (%s)",
            (name,),
        )
        connection.commit()
        city_id = cursor.lastrowid
        LOGGER.debug("City {} inserted correctly, returning associated id {}".format(name, city_id))
        return city_id
    except mysql.connector.IntegrityError:
        # Same table-name spelling as the INSERT above: where table names are
        # case-sensitive, "City" would name a different (missing) table.
        cursor.execute(
            "SELECT idCity FROM city WHERE cityName = %s",
            (name,),
        )
        city_id = cursor.fetchone()[0]
        LOGGER.debug("City {} already present, returning associated id {}".format(name, city_id))
        return city_id
def test_connection():
    """
    Open a database connection with the configured credentials and close it.

    Nothing is caught here, so a failure to connect propagates to the caller
    and the success message is logged only when opening and closing worked.
    """
    LOGGER.debug("Testing db connection")
    get_db_connection(
        constants.DB_USER,
        constants.DB_PASSWORD,
        constants.DB_HOST,
        constants.DB_NAME,
    ).close()
    LOGGER.debug("Connection established successfully")
def get_comments(element, tasto):
    """
    Expand and collect the comments of one post.

    Keeps clicking the "more comments" link inside ``element`` until none is
    left, then reads author, text and profile link of every comment.
    Comments whose profile link contains ".php" are counted but skipped.

    :param element: post element containing the comment section
    :param tasto: ignored; kept so existing callers keep working
    :return: list of dicts with keys "author", "text" and "linkToProfile";
        whatever was collected before an error, possibly empty
    """
    comments = []
    more_comments_xpath = ".//a[@class='_4sxc _42ft']"
    try:
        while True:
            time.sleep(0.5)
            # Query once and reuse the result: looking the link up a second
            # time could find it gone and abort the whole collection.
            buttons = element.find_elements_by_xpath(more_comments_xpath)
            if not buttons:
                break
            # A fresh ActionChains per click, as in the original flow.
            ActionChains(driver).move_to_element(buttons[0]).perform()
            buttons[0].click()

        section = element.find_element_by_xpath(
            selectors.get("comment_section"))
        comment_elements = section.find_elements_by_xpath(
            ".//div[@class='_72vr']")
        cont = 0
        cont_php = 0
        for elem in comment_elements:
            try:
                cont += 1
                author_element = elem.find_element_by_xpath(
                    selectors.get("comment_author"))
                href_commenti = author_element.get_attribute('href')
                author = author_element.text
                text = elem.find_element_by_xpath(
                    selectors.get("comment_text")).text
                href_finale = href_account(href_commenti)
                if ".php" not in href_finale:
                    comments.append({
                        "author": author,
                        "text": text,
                        "linkToProfile": href_finale
                    })
                else:
                    cont_php += 1
            except Exception as e:
                # One unreadable comment must not discard the others.
                LOGGER.debug("{}".format(e))
        print(str(cont) + "commenti")
        print(str(cont_php) + "profili.php")
    except Exception as e:
        LOGGER.debug("{}".format(e))
    return comments
def check_lavori(elem):
    """
    Collect the job names listed in a profile section element.

    :param elem: element searched for work-experience entries
    :return: list with the link text of every entry found, empty when there
        are none
    """
    entries = elem.find_elements_by_xpath(
        ".//li[@class='_43c8 _5f6p fbEditProfileViewExperience experience']")
    jobs = [
        entry.find_element_by_xpath(
            ".//div[@class='_2lzr _50f5 _50f7']/a").text
        for entry in entries
    ]
    LOGGER.debug("jobs")
    LOGGER.debug(jobs)
    return jobs
def check_sesso(elem):
    """
    Read the gender shown in a profile section element.

    :param elem: element searched for the gender entry
    :return: 1 for "Uomo", 0 for "Donna", the string "NULL" for anything else
    """
    gender_item = elem.find_element_by_xpath(
        ".//li[@class='_3pw9 _2pi4 _2ge8 _3ms8']")
    label = gender_item.find_element_by_xpath(
        ".//div[@class='clearfix']/div/span").text
    LOGGER.debug(str(label) + " sesso")

    if len(label) == 0:
        return "NULL"
    if label == "Uomo":
        return 1
    if label == "Donna":
        return 0
    return "NULL"
def add_node(self, node):
    """
    Record ``node`` in the bucket returned by ``self.get_bucket``.

    - The local node (same id as ``self.self_node``) is ignored.
    - A node already in the bucket is replaced by the new object and moved
      to the front.
    - If the bucket holds BUCKET_SIZE nodes or more, the node goes to the
      bucket's replacement cache unless it is already there.
    - Otherwise the node is pushed into the bucket, removed from the
      replacement cache and stamped with the current time.
    """
    bucket = self.get_bucket(node)

    # exclude self
    if self.self_node.node_id == node.node_id:
        return

    # already known: drop the old entry and put the new one in front
    existing = next(
        (known for known in bucket.nodes if known.node_id == node.node_id),
        None)
    if existing is not None:
        bucket.nodes.remove(existing)
        bucket.nodes.insert(0, node)
        LOGGER.debug('{:5} bump {} in bucket #{}'.format(
            '', node, self.buckets.index(bucket)))
        return

    # bucket is full: keep the node as a replacement candidate
    if len(bucket.nodes) >= BUCKET_SIZE:
        if any(cached.node_id == node.node_id
               for cached in bucket.replace_cache):
            return
        push_node(bucket.replace_cache, node, BUCKET_SIZE)
        LOGGER.debug('{:5} push {} to replacement #{}'.format(
            '', node, self.buckets.index(bucket)))
        return

    # room available: store the node and clear it from the replace cache
    push_node(bucket.nodes, node, BUCKET_SIZE)
    LOGGER.debug('{:5} push {} to bucket #{}'.format(
        '', node, self.buckets.index(bucket)))
    del_node(bucket.replace_cache, node)
    node.added_time = time.time()
def insert_job(user_id, job, cursor, connection):
    """
    Store one job and associate it with a user.

    The job row and the user/job link are written through the same cursor
    and committed together, as insert_contact does for contacts. Any error
    is logged and swallowed, so the function always returns ``None``.

    :param user_id: id of the user the job belongs to
    :param job: job name
    :param cursor: database cursor
    :param connection: database connection
    """
    try:
        # Bound parameters: job names come from scraped pages.
        cursor.execute(
            "INSERT INTO job (jobName) values (%s)",
            (job,),
        )
        job_id = cursor.lastrowid
        cursor.execute(
            "INSERT INTO userjob (idUser,idJob) values (%s,%s)",
            (user_id, job_id),
        )
        # Without this commit the two rows depend on some later, unrelated
        # commit on the same connection to be persisted.
        connection.commit()
        LOGGER.debug("job inserted correctly")
    except Exception as e:
        LOGGER.error("Error while inserting job of user {} error {}".format(user_id, e))
def scrape_data(save_status, href_account):
    """
    Scrape the "about" sections of the profile currently open in ``driver``
    and hand the result to ``repository.insert_personal_data``.

    For each of three sub-section links the function clicks the "about" tab,
    clicks the link, collects the section elements and fills ``output``:
    jobs from the first link, city from the second, sex and contacts from the
    third. Errors in one sub-section are printed and the next one is tried.

    :param save_status: only used in the error message printed when a
        sub-section fails
    :param href_account: profile link passed to the repository together with
        the scraped data; when it is empty nothing is scraped
    """
    if len(href_account) > 0:
        # Defaults that are stored when a sub-section yields nothing.
        output = {"sex": "", "cityName": "", "contacts": [], "jobs": []}
        # Sub-section links, in order: [0] feeds jobs, [1] feeds the city,
        # [2] feeds sex and contacts (see the branches below).
        tasti = [
            ".//a[@class='_5pwr _84vh']", ".//a[@class='_5pwr _84vg']",
            ".//a[@class='_5pwr _84vf']"
        ]
        contatore = ""
        for i in tasti:
            try:
                driver.find_element_by_xpath(
                    ".//a[@data-tab-key='about']").click()
                # Remember which sub-section is being processed.
                contatore = i
                time.sleep(0.5)
                driver.find_element_by_xpath(i).click()
                time.sleep(1)
                info = driver.find_element_by_xpath("//div[@class='_4ms4']")
                data = []
                # NOTE(review): the XPaths below start with "//", which
                # selects from the document root even though they are called
                # on ``info`` — confirm that is intended.
                if contatore == tasti[2]:
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_contact']"))
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_basic']"))
                elif contatore == tasti[0]:
                    data = info.find_elements_by_xpath("//div[@class='_4qm1']")
                else:
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_hometown']"))
                print("data")
                print(len(data))
                time.sleep(8)
                for elem in data:
                    print("ELEMENTO:")
                    print(elem)
                    # Section title; used below to choose which parser runs.
                    sezione = elem.find_element_by_xpath(
                        ".//div[@class='clearfix _h71']/span").text
                    LOGGER.debug(str(sezione) + " sezione")
                    if contatore == tasti[0]:
                        try:
                            if "LAVORO" in sezione:
                                output["jobs"] = check_lavori(elem)
                        except Exception as e:
                            print(str(e) + " lavoro exception")
                    if contatore == tasti[1]:
                        try:
                            output["cityName"] = check_citta(elem)
                        except Exception as e:
                            print(str(e) + " citta exception")
                    if contatore == tasti[2]:
                        try:
                            if "BASE" in sezione:
                                output["sex"] = check_sesso(elem)
                        except Exception as e:
                            print(str(e) + " sesso exception")
                        # NOTE(review): nesting reconstructed from a
                        # whitespace-mangled source; this contact check is
                        # assumed to belong to the third sub-section — confirm.
                        try:
                            if "CONTATTO" in sezione:
                                output["contacts"] = check_contatti(elem)
                        except Exception as e:
                            print(str(e) + " contatto exception")
            except Exception as e:
                # A failing sub-section is reported; the loop moves on.
                print("Exception (scrape_data)", str(i), "Status =",
                      str(save_status),
                      sys.exc_info()[0], e)
        LOGGER.debug("Insert post")
        print("output per db")
        print(output)
        repository.insert_personal_data(href_account, output)
    else:
        print("href non valido")
def scrap_pag():
    """
    Scrape the posts of the page currently open in ``driver`` and store them.

    Scrolls the page, collects the post elements and, for at most
    ``post_da_scrap`` of them, reads location, text and comments. Posts with
    no location, or whose location is not Italian, are skipped. The collected
    posts are handed to ``repository.insert_post``, then
    ``scrap_account_commenti`` is called.
    """
    utils.scroll(total_scrolls, driver, selectors, scroll_time)
    #utils.scroll_to_end(driver)
    data = []
    data += driver.find_elements_by_xpath(
        "//div[@class='_4-u2 _3xaf _3-95 _4-u8']")
    data += driver.find_elements_by_xpath("//div[@class='_4-u2 _4-u8']")
    dati_post = []
    commenti_lista = []
    contatore = 0
    for element in data:
        try:
            if contatore == post_da_scrap:
                break
            LOGGER.debug("post {} of {}".format(contatore, post_da_scrap))
            LOGGER.debug("scraping {}".format(element))
            # Skipped posts count towards the limit as well.
            contatore += 1
            time.sleep(0.5)
            tasto = element.find_elements_by_xpath(
                ".//a[@class='_4sxc _42ft']")
            try:
                luogo = element.find_element_by_class_name("_1dwg._1w_m._q7o")\
                    .find_element_by_xpath(".//span[@class='fcg']/a").text
                if not utils.is_italian_location(luogo):
                    continue
            except Exception as e:
                LOGGER.debug("location not found {}".format(e))
                continue
            try:
                testo = element.find_elements_by_xpath(
                    ".//div[@data-testid='post_message']/p")[0].text
            except Exception as e:
                LOGGER.debug("Post has no text")
                # Reset explicitly: otherwise ``testo`` is unbound on the
                # first post, or still holds the previous post's text.
                testo = ""
            LOGGER.debug("Retrieving comments")
            commenti = get_comments(element, tasto)
            LOGGER.debug("appending comments")
            commenti_lista.append(commenti)
            LOGGER.debug("retrieved comments")
            post = {
                "postText": testo,
                "comments": commenti,
                "location": luogo
            }
            print(post)
            dati_post.append(post)
        except Exception as e:
            # One broken post must not stop the others.
            LOGGER.debug("{}".format(e))
    LOGGER.debug("{} inserting to db ".format(dati_post))
    for x in dati_post:
        LOGGER.debug("Inserting post {} num comments {}".format(
            x, len(x["comments"])))
        repository.insert_post(x)
    scrap_account_commenti()
def insert_post(post):
    """
    Insert a post, its location, its comments and their authors.

    :param post: dict defined as follows:
        {
            postText: "....",
            comments: [{...},{...},ecc] -> every comment is a dict composed
                as follows
                {
                    "author": "...",
                    "text": "..."
                    "linkToProfile": "..." link to facebook profile without
                        standard prefix https://www.facebook.com
                }
            location: "..."
        }

    Comments whose profile link contains ".php" are skipped. Post and comment
    text are bound as query parameters, so they are stored unmodified and the
    caller's dict is not changed. Database errors are logged and the open
    transaction is rolled back; the connection is always closed.
    """
    connection = None
    post_text = post["postText"]
    try:
        connection = get_db_connection(constants.DB_USER,
                                       constants.DB_PASSWORD,
                                       constants.DB_HOST, constants.DB_NAME)
        cursor = connection.cursor()
        # Quotes are still stripped from values handed to insert_location /
        # insert_user, matching what those helpers have always received.
        location_id = insert_location(post["location"].replace("'", " "),
                                      cursor, connection)
        # A location_id of None is bound as SQL NULL.
        cursor.execute(
            "INSERT INTO post (postText, postSentiment, idOfMentionedLocation) "
            "values (%s,%s,%s)",
            (
                post_text,
                None,  # TODO Sentiment
                location_id,
            ))
        post_id = cursor.lastrowid
        comment_query = "INSERT INTO comment (commentText, commentSentiment, idOfPost, idOfAuthor) " \
                        "values (%s,%s,%s,%s)"
        for comment in post["comments"]:
            author = comment["author"].replace("'", " ")
            text = comment["text"]
            link = comment["linkToProfile"]
            if ".php" not in link:
                author_id = insert_user(author, link, cursor, connection)
                cursor.execute(
                    comment_query,
                    (
                        text,
                        None,  # TODO Sentiment
                        post_id,
                        author_id,
                    ))
        connection.commit()
        LOGGER.debug("Post inserted correctly")
    except Exception as e:
        # The connection is still None when connecting itself failed.
        if connection is not None:
            connection.rollback()
        LOGGER.error("Error while inserting post: {}".format(e))
    finally:
        if connection is not None:
            connection.close()