Python collect_element Beispiele, social_media_scraper.account.page_utils.collect_element Python Beispiele

Beispiel #1

0

Datei anzeigen

def collect_xing_page(data: PageContent):
    """ Gathers data from a Xing page.

    Parses ``data.outer`` and ``data.inner`` as HTML, splits the entries
    found via ``ENTRIES_LOCATOR`` into work and education entries, and
    returns a ``XingAccount`` identified by ``data.link``.
    """
    outer_tree = fromstring(data.outer)
    inner_tree = fromstring(data.inner)
    tags = collect_tags(inner_tree)

    work_nodes = []
    education_nodes = []
    # Entries are appended to the work list until the education heading is
    # met; from then on they go to the education list. The heading itself
    # is never stored.
    bucket = work_nodes
    for node in outer_tree.xpath(ENTRIES_LOCATOR):
        if node.tag == "h2" and node.text_content() == "Educational background":
            bucket = education_nodes
        else:
            bucket.append(node)

    work_experience = collect_work_experience(work_nodes)
    education = collect_education(education_nodes)

    return XingAccount(
        xingAccountId=data.link,
        name=collect_element(outer_tree, NAME),
        currentPosition=collect_element(outer_tree, CURRENT_POSITION),
        locaton=collect_element(outer_tree, LOCATION),
        haves=tags[0],
        wants=tags[1],
        xingWorkExperiences=work_experience,
        xingEducations=education,
    )

Beispiel #2

0

Datei anzeigen

def collect_education(page):
    """ Builds one LinkedInEducation per education record found on the page.

    Records are located with ``lookup_element(page, EDUCATION_RECORD)``;
    the facility, degree, specialty and date range of each record are read
    with ``collect_element``. Returns the records as a list.
    """
    return [
        LinkedInEducation(
            facilityName=collect_element(record, EDUCATION_FACILITY),
            degreeName=collect_element(record, EDUCATION_DEGREE),
            specialtyName=collect_element(record, EDUCATION_SPECIALTY),
            dateRange=collect_element(record, EDUCATION_DATERANGE),
        )
        for record in lookup_element(page, EDUCATION_RECORD)
    ]

Beispiel #3

0

Datei anzeigen

def collect_education(educaion_entries):
    """ Collects education from page entries.

    Returns a list with one ``XingEducation`` per entry in
    ``educaion_entries``.
    """
    collected = []
    for entry in educaion_entries:
        info_nodes = lookup_element(entry, INFO_SELECTOR)
        date = collect_element(entry, TIME_SELECTOR)
        # When a date was found, the school and subject are requested one
        # position later (2 and 3 instead of 1 and 2).
        school_at = 2 if date else 1
        degree = collect_element(entry, POSITION_SELECTOR)
        school = content_check(info_nodes, school_at)
        subject = content_check(info_nodes, school_at + 1)
        collected.append(
            XingEducation(degree=degree,
                          schoolName=school,
                          subject=subject,
                          date=date))
    return collected

Beispiel #4

0

Datei anzeigen

def collect_work_experience(experience_entries):
    """ Collects work experience from page entries.

    Returns a list with one ``XingWorkExperience`` per entry in
    ``experience_entries``.
    """
    collected = []
    for box in experience_entries:
        info_nodes = lookup_element(box, INFO_SELECTOR)
        description_nodes = lookup_element(box, DESCRIPTION_SELECTOR)
        date = collect_element(box, TIME_SELECTOR)
        position = collect_element(box, POSITION_SELECTOR)
        # When a date was found, the company name is requested at
        # position 2 instead of 1.
        if date:
            company = content_check(info_nodes, 2)
        else:
            company = content_check(info_nodes, 1)
        description = content_check(description_nodes, 1)
        collected.append(
            XingWorkExperience(position=position,
                               companyName=company,
                               description=description,
                               date=date))
    return collected

Beispiel #5

0

Datei anzeigen

def collect_tweets(page):
    """ Collects tweet data.

    Returns a list with one ``Tweet`` per element found via
    ``lookup_element(page, TWEETS)``.
    """

    def build_record(node):
        """ Turns a single tweet element into a Tweet record. """

        def stat(selector):
            # Counter text is read from the element, then parsed.
            return parse_stat_numbers(collect_element(node, selector))

        posted_at = get_tweet_datetime(node)
        # NOTE(review): the result of check_if_retweet is stored as
        # isOriginal - confirm the helper's polarity matches that name.
        original_flag = check_if_retweet(node)
        comments = stat(TWEET_AMOUNT_COMMENTS)
        retweets = stat(TWEET_AMOUNT_RETWEETS)
        likes = stat(TWEET_AMOUNT_LIKES)
        return Tweet(text=collect_element(node, TWEET_TEXT),
                     datetime=posted_at,
                     isOriginal=original_flag,
                     amountComments=comments,
                     amountRetweets=retweets,
                     amountLikes=likes)

    return [build_record(node) for node in lookup_element(page, TWEETS)]

Beispiel #6

0

Datei anzeigen

def collect_linked_in_page(html: str, link: str):
    """ Gathers data from LinkedIn page.

    Parses ``html`` and returns a ``LinkedInAccount`` identified by
    ``link``. The name and location are the text of the first and second
    elements matched by ``NAME_LOCATION_SELECTOR``.
    """
    tree = fromstring(html)
    header_nodes = lookup_element(tree, NAME_LOCATION_SELECTOR)
    position = collect_element(tree, CURRENT_POSITION)
    work_history = collect_experience(tree)
    schooling = collect_education(tree)

    full_name = header_nodes[0].text_content()
    location_text = header_nodes[1].text_content()
    return LinkedInAccount(
        linkedInAccountId=link,
        name=full_name,
        currentPosition=position,
        locaton=location_text,
        linkedInWorkExperiences=work_history,
        linkedInEducations=schooling,
    )

Beispiel #7

0

Datei anzeigen

def collect_twitter_page(html: str, link: str):
    """ Gathers data from twitter page.

    Parses ``html`` and returns a ``TwitterAccount`` identified by
    ``link``, with its ``twitterAccountDetails`` and ``tweets`` attributes
    populated.
    """
    tree = fromstring(html)

    # Raw counter texts are read first and parsed when the details are built.
    raw_tweets = collect_element(tree, TWEET_AMOUNT)
    raw_subscriptions = collect_element(tree, SUBSCRIPTIONS_AMOUNT)
    raw_subscribers = collect_element(tree, SUBSCRIBERS_AMOUNT)
    raw_likes = collect_element(tree, PROFILE_LIKES_AMOUNT)

    profile = TwitterAccount(
        twitterAccountId=link,
        name=collect_element(tree, PROFILE_NAME),
        atName=collect_element(tree, ACCOUNT_NAME),
    )
    profile.twitterAccountDetails = TwitterAccountDetails(
        description=collect_element(tree, DESCRIPTION),
        location=collect_element(tree, LOCATION),
        registerDate=collect_element(tree, REGISTER_DATE),
        amountTweets=parse_stat_numbers(raw_tweets),
        amountSubscriptions=parse_stat_numbers(raw_subscriptions),
        amountSubscribers=parse_stat_numbers(raw_subscribers),
        amountLikes=parse_stat_numbers(raw_likes),
    )
    profile.tweets = collect_tweets(tree)
    return profile

Beispiel #8

0

Datei anzeigen

def collect_timeline_experience(element,
                                timeline) -> List[LinkedInWorkExperience]:
    """ Parses each work experience timeline element and extracts data from it.

    Args:
        element: the experience element; the company name is read from it
            via ``TIMELINE_COMPANY`` and shared by every produced record.
        timeline: iterable of timeline elements, one per position held.

    Returns:
        A list with one ``LinkedInWorkExperience`` per item of ``timeline``
        (empty when ``timeline`` is empty).
    """
    company = collect_element(element, TIMELINE_COMPANY)
    # Results are built in a fresh list. The previous code rebound
    # ``timeline`` to ``[]`` before looping, which discarded the input and
    # made the function always return an empty list.
    return [
        LinkedInWorkExperience(
            companyName=company,
            position=collect_element(experience, TIMELINE_POSITION),
            dateRange=collect_element(experience, TIMELINE_DATERANGE),
            timeWorked=collect_element(experience, TIMELINE_DURATION),
            location=collect_element(experience, TIMELINE_LOCATION),
            description=collect_element(experience, TIMELINE_DESCRIPTION))
        for experience in timeline
    ]

Beispiel #9

0

Datei anzeigen

def collect_experience(page) -> List[LinkedInWorkExperience]:
    """ Parses work experience page element and extracts data from it.

    For each element found via ``EXPERIENCE_RECORD``: when it contains
    inner timeline elements, the records produced by
    ``collect_timeline_experience`` are added; otherwise a single
    ``LinkedInWorkExperience`` is built from the element itself.
    """
    collected = []
    for record in lookup_element(page, EXPERIENCE_RECORD):
        inner_timeline = lookup_element(record, EXPERIENCE_INNER_TIMELINE)
        if inner_timeline:
            collected.extend(
                collect_timeline_experience(record, inner_timeline))
            continue

        single = LinkedInWorkExperience(
            position=collect_element(record, EXPERIENCE_POSITION),
            companyName=collect_element(record, EXPERIENCE_COMPANY),
            dateRange=collect_element(record, EXPERIENCE_DATERANGE),
            timeWorked=collect_element(record, EXPERIENCE_DURATION),
            location=collect_element(record, EXPERIENCE_LOCATION),
            description=collect_element(record, EXPERIENCE_DESCRIPTION))
        collected.append(single)
    return collected