Exemple #1
0
def insert_contact(user_id, contact, cursor, connection):
    """
        Insert a contact row and link it to a user, then commit.

        Any failure is logged and swallowed; nothing is returned in either case.

        :param user_id: id of the user the contact belongs to
        :param contact:  {
                            "contact": "......",
                            "type": "EMAIL" / "NUMBER" / "ecc"
                         }
        :param connection: database connection
        :param cursor: database cursor
    """

    try:

        # Values are bound by the driver rather than formatted into the SQL text,
        # so quotes inside a contact can neither break nor alter the statement.
        cursor.execute("INSERT INTO contact (contact,idOfContactType) values (%s,%s)",
                       (
                           contact["contact"],
                           get_id_of_contact_type(contact["type"], cursor, connection)
                       ))

        contact_id = cursor.lastrowid

        cursor.execute("INSERT INTO usercontacts (idContact,idUser) values (%s,%s)", (contact_id, user_id))

        LOGGER.debug("contact inserted correctly, returning associated id {}".format(contact_id))
        connection.commit()

    except Exception as e:

        LOGGER.error("Error while inserting contact of user {} error {}".format(user_id, e))
Exemple #2
0
def get_id_of_contact_type(contact_type, cursor, connection):
    """
        Look up the id of a contact type, creating the type when it does not exist.

        :param contact_type: contact type
        :param cursor: database cursor
        :param connection: database connection, committed after a new type is created
        :return: id of contact type if found, creates one and returns it if not found
    """

    cursor.execute("SELECT idContactType FROM contacttype WHERE contactType = %s", (contact_type.upper(),))
    row = cursor.fetchone()

    # Only a genuinely missing row leads to the insert below; database errors
    # raised by the lookup now propagate to the caller instead of being masked.
    if row is not None:

        LOGGER.debug("returning id of contact type: {}".format(contact_type))

        return row[0]

    # NOTE(review): the lookup compares against the upper-cased name while the
    # value stored here keeps its original case; this relies on the column
    # comparing case-insensitively - confirm against the schema.
    cursor.execute("INSERT INTO contacttype (contactType) values (%s)", (contact_type,))
    contact_type_id = cursor.lastrowid
    connection.commit()

    LOGGER.debug("Contact type {} created correctly, returning associated id {}".format(contact_type,
                                                                                        contact_type_id))

    return contact_type_id
Exemple #3
0
def insert_location(location, cursor, connection):

    """
        Insert a location, or fetch the id of the existing row when the insert
        is rejected with an integrity error.

        :param location: location name as string; the literal "NULL" means "no location"
        :param cursor: database cursor
        :param connection: database connection
        :return: inserted location's id both if already present or not,
                 None when location is "NULL"
    """

    if location == "NULL":
        return None

    try:

        # Bound parameter: the name is never interpolated into the SQL text
        cursor.execute("INSERT INTO location (locationName) values (%s)", (location,))
        connection.commit()

        location_id = cursor.lastrowid

        LOGGER.debug("Location {} inserted correctly, returning associated id {}".format(location, location_id))

        return location_id

    except mysql.connector.IntegrityError:

        cursor.execute("SELECT idLocation FROM location WHERE locationName = %s", (location,))
        location_id = cursor.fetchone()[0]

        LOGGER.debug("Location {} already present, returning associated id {}".format(location, location_id))

        return location_id
Exemple #4
0
def get_user_id_from_link(link, cursor):
    """
        Fetch the id of the user whose profile link matches.

        :param link:  linkToProfile no prefix
        :param cursor: database cursor
        :return: user id, or None when the user is not found or the query fails
                 (the failure is logged)
    """
    try:

        cursor.execute("SELECT idUser FROM user WHERE linkToProfile = %s", (link,))
        # fetchone() yields None when no row matches; the resulting TypeError
        # is handled below like any other lookup failure.
        user_id = cursor.fetchone()[0]

        LOGGER.debug("User {} returning associated id {}".format(link, user_id))

        return user_id

    except Exception as e:

        LOGGER.error("Error while getting id of user: {} error: {}".format(link, e))

        return None
Exemple #5
0
def insert_user(author, link, cursor, connection):

    """
        Insert a user, or fetch the id of the existing row when the insert is
        rejected with an integrity error.

        New users are stored with alreadyVisited = 0 and no sex / current city.
        A successful insert is committed immediately.

        :param author: username
        :param link: link to facebook profile without standard prefix https://www.facebook.com
        :param cursor: database cursor
        :param connection: database connection
        :return: inserted user's id both if already present or not
    """

    try:

        # Bound parameters keep user-supplied names and links out of the SQL text
        cursor.execute("INSERT INTO user (username, linkToProfile, alreadyVisited, sex, idCurrentCity) "
                       "values (%s,%s, 0, NULL, NULL)", (author, link))
        connection.commit()

        user_id = cursor.lastrowid

        LOGGER.debug("User {} inserted correctly, returning associated id {}".format(author, user_id))

        return user_id

    except mysql.connector.IntegrityError:

        cursor.execute("SELECT idUser FROM user WHERE linkToProfile = %s", (link,))

        user_id = cursor.fetchone()[0]

        LOGGER.debug("User {} already present, returning associated id {} ".format(author, user_id))

        return user_id
Exemple #6
0
    def re_validate(self):
        """
        Endless loop: after every RE_VALIDATE_INTERVAL seconds, take the last
        node of a randomly chosen non-empty bucket and ping it.

        A node that answers is moved to the front of its bucket. A node that
        does not answer stays removed and, when the bucket's replace cache is
        not empty, a random entry of that cache is appended in its place.
        """
        while True:
            time.sleep(RE_VALIDATE_INTERVAL)

            # walk the buckets in random order until one holding nodes is found
            order = list(range(len(self.buckets)))
            random.shuffle(order)
            candidate = None
            for position in order:
                nodes = self.buckets[position].nodes
                if nodes:
                    candidate = nodes.pop()
                    break

            if candidate is None:
                # every bucket was empty, nothing to check this round
                continue

            LOGGER.debug('{:5} revalidate {}'.format('', candidate))
            # wait for a pong
            alive = self.server.ping(candidate).get()
            bucket = self.buckets[position]

            if alive:
                # bump node
                bucket.nodes.insert(0, candidate)
                continue

            if not bucket.replace_cache:
                continue

            # pick a replacement
            pick = random.randint(0, len(bucket.replace_cache) - 1)
            replacement = bucket.replace_cache.pop(pick)
            if replacement:
                bucket.nodes.append(replacement)
Exemple #7
0
def is_italian_location(location):
    """
        Tell whether a location looks Italian.

        Searches Google for "<location> wikipedia", follows the first
        it.wikipedia.org link found in the result page and checks whether that
        page contains the Italian flag markup.

        :param location: location name as string
        :return: True when the flag markup is found, False otherwise - including
                 when a request fails or no it.wikipedia.org link is found
    """
    location = location.lstrip().rstrip().replace(" ", "+") + "+wikipedia"

    LOGGER.debug("Testing location {}".format(location))

    # Without a timeout a stalled server would block the scraper forever
    request_timeout = 10

    try:
        query = requests.get(
            "https://www.google.com/search?q={}".format(location),
            timeout=request_timeout)

        link = re.search('https://it.wikipedia.org/.+?(?=&amp)', query.text)

        if link is None:
            LOGGER.debug("Error while locating nation")
            return False

        wikipedia_page = requests.get(link.group(), timeout=request_timeout)
        LOGGER.debug("wikipedia page {}".format(wikipedia_page))

    except requests.RequestException as e:

        # There is no page to inspect, so the location cannot be confirmed
        LOGGER.debug("Error while locating nation")
        LOGGER.debug("{}".format(e))
        return False

    is_italian = (
        '<span style="white-space:nowrap"><a href="/wiki/File:Flag_of_Italy.svg" class="image" title="Italia">'
        in wikipedia_page.text)

    LOGGER.debug("Location {} is italian? {}".format(location, is_italian))
    return is_italian
Exemple #8
0
def check_citta(elem):
    """
        Read a city name from the first matching link under elem.

        :param elem: element exposing find_elements_by_xpath
        :return: text of the first matching link, or the string "NULL" when
                 no link matches
    """
    links = elem.find_elements_by_xpath(".//div[@class='_6a _6b']/span/a")
    city = links[0].text if len(links) > 0 else "NULL"
    LOGGER.debug(city)
    return city
Exemple #9
0
def insert_city(name, cursor, connection):

    """
        Insert a city, or fetch the id of the existing row when the insert is
        rejected with an integrity error.

        :param name: cityName
        :param cursor: database cursor
        :param connection: database connection
        :return: inserted city's id both if already present or not
    """

    try:

        # Bound parameter: the name is never interpolated into the SQL text
        cursor.execute("INSERT INTO city (cityName) values (%s)", (name,))
        connection.commit()

        city_id = cursor.lastrowid

        LOGGER.debug("City {} inserted correctly, returning associated id {}".format(name, city_id))

        return city_id

    except mysql.connector.IntegrityError:

        # Same spelling of the table name as the INSERT above: table names are
        # case-sensitive on some MySQL installations.
        cursor.execute("SELECT idCity FROM city WHERE cityName = %s", (name,))
        city_id = cursor.fetchone()[0]

        LOGGER.debug("City {} already present, returning associated id {}".format(name, city_id))

        return city_id
Exemple #10
0
def test_connection():
    """
        Open a database connection with the credentials from ``constants``
        and close it right away, logging before and after.
    """

    LOGGER.debug("Testing db connection")

    get_db_connection(
        constants.DB_USER,
        constants.DB_PASSWORD,
        constants.DB_HOST,
        constants.DB_NAME,
    ).close()

    LOGGER.debug("Connection established successfully")
Exemple #11
0
def get_comments(element, tasto):
    """
        Expand and collect the comments of a post.

        Keeps clicking the link matched by ``.//a[@class='_4sxc _42ft']`` until
        it is no longer present, then reads author, text and profile link of
        every comment. Comments whose profile link contains ".php" are skipped.
        Errors are logged at debug level; whatever was collected so far is returned.

        :param element: post element exposing the Selenium find_element(s)_by_xpath API
        :param tasto: not read on entry - the link is looked up again on every
                      iteration; kept so existing callers keep working
        :return: list of dicts with keys "author", "text", "linkToProfile"
    """
    comments = []
    try:

        while True:
            time.sleep(0.5)

            # One lookup per iteration: the same result decides whether to
            # stop and provides the element to click.
            tasto = element.find_elements_by_xpath(
                ".//a[@class='_4sxc _42ft']")
            if not tasto:
                break

            ActionChains(driver).move_to_element(tasto[0]).perform()
            tasto[0].click()

        section = element.find_element_by_xpath(
            selectors.get("comment_section"))
        comment_nodes = section.find_elements_by_xpath(".//div[@class='_72vr']")

        cont = 0
        cont_php = 0

        for elem in comment_nodes:

            try:

                cont += 1
                # The author element provides both the profile link and the name
                author_node = elem.find_element_by_xpath(
                    selectors.get("comment_author"))
                href_commenti = author_node.get_attribute('href')
                author = author_node.text
                text = elem.find_element_by_xpath(
                    selectors.get("comment_text")).text
                href_finale = href_account(href_commenti)

                if ".php" not in href_finale:
                    comments.append({
                        "author": author,
                        "text": text,
                        "linkToProfile": href_finale
                    })
                else:
                    cont_php += 1

            except Exception as e:
                # A broken comment must not prevent reading the remaining ones
                LOGGER.debug("{}".format(e))

        print(str(cont) + "commenti")
        print(str(cont_php) + "profili.php")

    except Exception as e:
        LOGGER.debug("{}".format(e))

    return comments
Exemple #12
0
def check_lavori(elem):
    """
        Collect the link text of every job entry found under elem.

        :param elem: element exposing find_elements_by_xpath
        :return: list of job names, empty when no entry matches
    """
    entries = elem.find_elements_by_xpath(
        ".//li[@class='_43c8 _5f6p fbEditProfileViewExperience experience']")
    jobs = [
        entry.find_element_by_xpath(
            ".//div[@class='_2lzr _50f5 _50f7']/a").text
        for entry in entries
    ]
    LOGGER.debug("jobs")
    LOGGER.debug(jobs)
    return jobs
Exemple #13
0
def check_sesso(elem):
    """
        Read the gender label under elem and map it to a code.

        :param elem: element exposing find_element_by_xpath
        :return: 1 for "Uomo", 0 for "Donna", the string "NULL" for anything else
    """
    label = elem.find_element_by_xpath(
        ".//li[@class='_3pw9 _2pi4 _2ge8 _3ms8']"
    ).find_element_by_xpath(
        ".//div[@class='clearfix']/div/span"
    ).text
    LOGGER.debug(str(label) + " sesso")

    if len(label) == 0:
        return "NULL"

    codes = {"Uomo": 1, "Donna": 0}
    return codes.get(label, "NULL")
Exemple #14
0
    def add_node(self, node):
        """
        Record a node in the bucket returned by ``self.get_bucket(node)``.

        - the local node (same node_id as ``self.self_node``) is ignored
        - a node already in the bucket is replaced by the new object at the front
        - when the bucket holds BUCKET_SIZE nodes or more, the node goes to the
          bucket's replace cache unless it is already there
        - otherwise the node is pushed to the bucket, removed from the replace
          cache and stamped with the current time in ``added_time``
        """
        bucket = self.get_bucket(node)

        # exclude self
        if self.self_node.node_id == node.node_id:
            return

        # already known: drop the old entry and put the new one first
        known = next(
            (n for n in bucket.nodes if n.node_id == node.node_id), None)
        if known is not None:
            bucket.nodes.remove(known)
            bucket.nodes.insert(0, node)
            LOGGER.debug('{:5} bump {} in bucket #{}'.format(
                '', node, self.buckets.index(bucket)))
            return

        if len(bucket.nodes) >= BUCKET_SIZE:
            # bucket is full: remember the node as a possible replacement
            if any(rc.node_id == node.node_id for rc in bucket.replace_cache):
                return
            push_node(bucket.replace_cache, node, BUCKET_SIZE)
            LOGGER.debug('{:5} push {} to replacement #{}'.format(
                '', node, self.buckets.index(bucket)))
        else:
            # room left: store the node and forget it as a replacement
            push_node(bucket.nodes, node, BUCKET_SIZE)
            LOGGER.debug('{:5} push {} to bucket #{}'.format(
                '', node, self.buckets.index(bucket)))
            del_node(bucket.replace_cache, node)
            node.added_time = time.time()
Exemple #15
0
def insert_job(user_id, job, cursor, connection):

    """
        Insert a job row and link it to a user, then commit.

        Any failure is logged and swallowed; nothing is returned in either case.

        :param user_id: id of the user the job belongs to
        :param job: job name
        :param cursor: database cursor
        :param connection: database connection
    """

    try:

        # Bound parameters keep the scraped job name out of the SQL text
        cursor.execute("INSERT INTO job (jobName) values (%s)", (job,))

        job_id = cursor.lastrowid

        cursor.execute("INSERT INTO userjob (idUser,idJob) values (%s,%s)", (user_id, job_id))

        # Commit here, as insert_contact does, so the rows are persisted even
        # when the caller does not commit on its own.
        connection.commit()

        LOGGER.debug("job inserted correctly")

    except Exception as e:

        LOGGER.error("Error while inserting job of user {} error {}".format(user_id, e))
Exemple #16
0
def scrape_data(save_status, href_account):
    """
        Scrape personal details from the profile loaded in the module-level
        ``driver`` and pass them to ``repository.insert_personal_data``.

        Three sub-tabs of the "about" tab are visited in turn. Depending on the
        sub-tab, the sections found feed the "jobs", "cityName", "sex" and
        "contacts" entries of the output dict. A failure while handling one
        sub-tab is printed and the next sub-tab is tried.

        :param save_status: only used in the message printed when a sub-tab fails
        :param href_account: profile reference handed to
                             repository.insert_personal_data together with the
                             scraped data; when empty nothing is scraped
    """
    if len(href_account) > 0:

        # Defaults that are stored as-is when a sub-tab yields nothing
        output = {"sex": "", "cityName": "", "contacts": [], "jobs": []}
        # Selectors of the sub-tabs; per the branches below tasti[0] feeds
        # "jobs", tasti[1] feeds "cityName", tasti[2] feeds "sex" and "contacts"
        tasti = [
            ".//a[@class='_5pwr _84vh']", ".//a[@class='_5pwr _84vg']",
            ".//a[@class='_5pwr _84vf']"
        ]

        contatore = ""
        for i in tasti:
            try:
                # Open the "about" tab again before selecting each sub-tab
                driver.find_element_by_xpath(
                    ".//a[@data-tab-key='about']").click()
                # Remember which sub-tab is being processed
                contatore = i
                time.sleep(0.5)
                driver.find_element_by_xpath(i).click()
                time.sleep(1)
                info = driver.find_element_by_xpath("//div[@class='_4ms4']")
                # NOTE(review): the XPaths below start with "//", so they are
                # not restricted to the subtree of `info` - confirm intended
                data = []
                if contatore == tasti[2]:
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_contact']"))
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_basic']"))
                elif contatore == tasti[0]:
                    data = info.find_elements_by_xpath("//div[@class='_4qm1']")
                else:
                    data.append(
                        info.find_element_by_xpath(
                            "//div[@id='pagelet_hometown']"))
                print("data")
                print(len(data))
                # Fixed 8 second pause before reading the sections
                time.sleep(8)
                for elem in data:
                    print("ELEMENTO:")
                    print(elem)
                    # Section title, used below to pick the matching parser
                    sezione = elem.find_element_by_xpath(
                        ".//div[@class='clearfix _h71']/span").text
                    LOGGER.debug(str(sezione) + " sezione")
                    if contatore == tasti[0]:
                        try:
                            if "LAVORO" in sezione:
                                output["jobs"] = check_lavori(elem)
                        except Exception as e:
                            print(str(e) + " lavoro exception")

                    if contatore == tasti[1]:
                        try:
                            output["cityName"] = check_citta(elem)
                        except Exception as e:
                            print(str(e) + " citta exception")
                    if contatore == tasti[2]:
                        try:
                            if "BASE" in sezione:
                                output["sex"] = check_sesso(elem)
                        except Exception as e:
                            print(str(e) + " sesso exception")

                        try:
                            if "CONTATTO" in sezione:
                                output["contacts"] = check_contatti(elem)
                        except Exception as e:
                            print(str(e) + " contatto exception")

            except Exception as e:
                # A failing sub-tab is reported and the loop moves on to the next one
                print("Exception (scrape_data)", str(i), "Status =",
                      str(save_status),
                      sys.exc_info()[0], e)
        # Runs even when every sub-tab failed; output then still holds the defaults
        LOGGER.debug("Insert post")
        print("output per db")
        print(output)
        repository.insert_personal_data(href_account, output)
    else:
        print("href non valido")
Exemple #17
0
def scrap_pag():
    """
        Scrape the posts of the page loaded in the module-level ``driver``,
        store them through ``repository.insert_post`` and then call
        ``scrap_account_commenti``.

        At most ``post_da_scrap`` post elements are examined. A post is kept
        only when its location can be read and ``utils.is_italian_location``
        accepts it. Each kept post is stored as a dict with the keys
        "postText", "comments" and "location".
    """
    utils.scroll(total_scrolls, driver, selectors, scroll_time)

    data = []
    data += driver.find_elements_by_xpath(
        "//div[@class='_4-u2 _3xaf _3-95 _4-u8']")
    data += driver.find_elements_by_xpath("//div[@class='_4-u2 _4-u8']")
    dati_post = []
    contatore = 0

    for element in data:
        try:
            if contatore == post_da_scrap:
                break

            LOGGER.debug("post {} of {}".format(contatore, post_da_scrap))
            LOGGER.debug("scraping {}".format(element))

            contatore += 1
            time.sleep(0.5)
            tasto = element.find_elements_by_xpath(
                ".//a[@class='_4sxc _42ft']")

            try:
                luogo = element.find_element_by_class_name("_1dwg._1w_m._q7o")\
                    .find_element_by_xpath(".//span[@class='fcg']/a").text

                if not utils.is_italian_location(luogo):
                    continue

            except Exception as e:
                LOGGER.debug("location not found {}".format(e))
                continue

            try:
                testo = element.find_elements_by_xpath(
                    ".//div[@data-testid='post_message']/p")[0].text
            except Exception as e:
                LOGGER.debug("Post has no text")
                # Without this the name would be unbound for the first post,
                # or still hold the text of the previously scraped post.
                testo = ""

            LOGGER.debug("Retrieving comments")
            commenti = get_comments(element, tasto)

            LOGGER.debug("retrieved comments")
            print({"postText": testo, "comments": commenti, "location": luogo})

            dati_post.append({
                "postText": testo,
                "comments": commenti,
                "location": luogo
            })

        except Exception as e:
            LOGGER.debug("{}".format(e))

    LOGGER.debug("{} inserting to db ".format(dati_post))

    for x in dati_post:

        LOGGER.debug("Inserting post {} num comments {}".format(
            x, len(x["comments"])))
        repository.insert_post(x)

    scrap_account_commenti()
Exemple #18
0
def insert_post(post):

    """
        :param post: dict defined as follows:
                {
                    postText: "....",
                    comments: [{...},{...},ecc] -> every comment is a dict composed as follows
                                                   {
                                                        "author": "...",
                                                        "text": "..."
                                                        "linkToProfile": "..." link to facebook profile without standard
                                                                               prefix https://www.facebook.com
                                                   }
                    location: "..."
                }
        function:Inserts the post passed as argument in the database

        The post dict is not modified. Comments whose linkToProfile contains
        ".php" are skipped. Errors raised while talking to the database are
        logged, not raised.
    """

    connection = None
    # Apostrophes are still replaced by spaces so stored text keeps the same
    # form as before; the caller's dict is left untouched.
    post_text = post["postText"].replace("'", " ")
    try:
        connection = get_db_connection(constants.DB_USER, constants.DB_PASSWORD, constants.DB_HOST, constants.DB_NAME)
        cursor = connection.cursor()

        post_query = "INSERT INTO post (postText, postSentiment, idOfMentionedLocation) " \
                     "values (%s,%s,%s)"

        location_id = insert_location(post["location"].replace("'", " "), cursor, connection)

        # A bound None is sent as SQL NULL, so a missing location needs no
        # separate statement.
        cursor.execute(post_query, (
            post_text,
            None,  # TODO Sentiment
            location_id
        ))

        post_id = cursor.lastrowid

        comment_query = "INSERT INTO comment (commentText, commentSentiment, idOfPost, idOfAuthor) " \
                        "values (%s,%s,%s,%s)"

        for comment in post["comments"]:

            author = comment["author"].replace("'", " ")
            text = comment["text"].replace("'", " ")
            link = comment["linkToProfile"]

            if ".php" in link:
                continue

            # NOTE: insert_user commits on the shared connection, so rows
            # written before this point can no longer be rolled back.
            author_id = insert_user(author, link, cursor, connection)

            cursor.execute(comment_query, (
                text,
                None,  # TODO Sentiment
                post_id,
                author_id
            ))

        connection.commit()

        LOGGER.debug("Post inserted correctly")

    except Exception as e:

        # connection is still None when opening it failed
        if connection is not None:
            connection.rollback()
        LOGGER.error("Error while inserting post: {}".format(e))

    finally:
        if connection is not None:
            connection.close()