Ejemplo n.º 1
0
    def _get_living(self, profile: NetworkProfile):
        """Fetch the "living" about-section of a profile and record the address.

        Requests the ``section=living`` about page of ``profile`` and, when the
        hovercard marker is present in the response, stores the text returned
        by ``helper_str.substring`` in ``profile.address``. Any failure is
        logged together with its traceback; nothing is raised to the caller.

        Args:
            profile: Profile to enrich. ``_userid``, ``_networkid`` and ``url``
                are read; ``address`` is written when a value is found.
        """
        try:
            # Example about-page URL (overview section shown):
            # https://www.facebook.com/profile.php?id=100030846743121&sk=about&section=overview&lst=100013325533097%3A100030846743121%3A1568790537

            url: str = "https://www.facebook.com/profile.php?id={}&sk=about&section=living&lst={}%3A{}%3A{}".format(
                profile._userid, self._userid, profile._userid,
                helper_time.ts_since_1970(10))

            html = self._ha.getstring(url,
                                      headers="""
            accept: */*
            accept-encoding: gzip, deflate
            accept-language: en-US,en;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://www.facebook.com
            pragma: no-cache
            referer: {}
            sec-fetch-mode: cors
            sec-fetch-site: same-origin""".format(profile.url))

            if html is None:
                return

            # presumably substring() returns the text between the two markers,
            # or a falsy value when the first marker is absent -- TODO confirm
            address = helper_str.substring(
                html, 'data-hovercard-prefer-more-content-show="1">', '<')
            if address:
                profile.address = address

        except Exception:
            # Function-scope import keeps this block self-contained.
            import traceback
            # The message names the page actually requested (it used to say
            # "education") and carries the traceback so failures are traceable.
            self._logger.error(
                "Get living page failed: username:{} url:{}\nerror: {}".format(
                    profile._networkid, profile.url, traceback.format_exc()))
Ejemplo n.º 2
0
    def _get_education(self, profile: NetworkProfile):
        """Fetch the education/work about-section and parse its sub-lists.

        Downloads the ``section=education`` about page, isolates the info
        container with ``self._re_info`` and then hands the work, skill and
        education fragments (when present and non-empty) to the matching
        private parsers. Errors are logged with a traceback and swallowed.

        Args:
            profile: Profile to enrich; passed through to the parsers.
        """
        try:
            url: str = "https://www.facebook.com/profile.php?id={}&sk=about&section=education&lst={}%3A{}%3A{}".format(
                profile._userid, self._userid, profile._userid,
                helper_time.ts_since_1970(10))

            request_headers = """
            accept: */*
            accept-encoding: gzip, deflate
            accept-language: en-US,en;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://www.facebook.com
            pragma: no-cache
            referer: {}
            sec-fetch-mode: cors
            sec-fetch-site: same-origin""".format(profile.url)
            page = self._ha.getstring(url, headers=request_headers)

            if page is None:
                return

            container = self._re_info.search(page)
            if container is None:
                self._logger.debug("No education info found: {} {}".format(
                    profile.nickname, profile.url))
                return

            # strip() always yields a str, so an emptiness test is sufficient.
            container_html: str = container.group(1).strip()
            if not container_html:
                self._logger.debug("Get education info failed: {} {}".format(
                    profile.nickname, profile.url))
                return

            # Work history fragment.
            work_match = self._re_edu_work.search(container_html)
            if work_match is not None:
                work_html = work_match.group(1).strip()
                if work_html:
                    self.__parse_edu_work(profile, work_html)

            # Professional skills fragment.
            skill_match = self._re_edu_skill.search(container_html)
            if skill_match is not None:
                skill_html = skill_match.group(1).strip()
                if skill_html:
                    self.__parse_edu_skill(profile, skill_html)

            # Education history fragment.
            edu_match = self._re_edu_edu.search(container_html)
            if edu_match is not None:
                edu_html = edu_match.group(1).strip()
                if edu_html:
                    self.__parse_edu_edu(profile, edu_html)

        except Exception:
            failure = traceback.format_exc()
            self._logger.error(
                "Get education page failed: username:{} url:{}\nerror: {}".
                format(profile._networkid, profile.url, failure))
Ejemplo n.º 3
0
    def _get_bio(self, profile: NetworkProfile):
        """Fetch the "bio" about-section and parse its three sub-sections.

        Downloads the ``section=bio`` about page and, for each of the
        other-names, favorites and about patterns that matches, passes the
        stripped first capture group to the corresponding private parser.
        Errors are logged and swallowed.

        Args:
            profile: Profile to enrich; passed through to the parsers.
        """
        try:
            # Example about-page URL (overview section shown):
            # https://www.facebook.com/profile.php?id=100030846743121&sk=about&section=overview&lst=100013325533097%3A100030846743121%3A1568790537

            url: str = "https://www.facebook.com/profile.php?id={}&sk=about&section=bio&lst={}%3A{}%3A{}".format(
                profile._userid, self._userid, profile._userid,
                helper_time.ts_since_1970(10))

            request_headers = """
            accept: */*
            accept-encoding: gzip, deflate
            accept-language: en-US,en;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://www.facebook.com
            pragma: no-cache
            referer: {}
            sec-fetch-mode: cors
            sec-fetch-site: same-origin""".format(profile.url)
            page = self._ha.getstring(url, headers=request_headers)

            if page is None:
                return

            # Other names.
            names_match = self._re_bio_othernames.search(page)
            if names_match is not None:
                self.__parse_bio_othernames(profile,
                                            names_match.group(1).strip())

            # Favorite quotes.
            favorites_match = self._re_bio_favorites.search(page)
            if favorites_match is not None:
                self.__parse_bio_favorites(profile,
                                           favorites_match.group(1).strip())

            # Free-text "about" paragraph.
            about_match = self._re_bio_about.search(page)
            if about_match is not None:
                self.__parse_bio_about(profile, about_match.group(1).strip())

        except Exception:
            self._logger.error(
                "Get bio page failed: username:{} url:{}".format(
                    profile._networkid, profile.url))
Ejemplo n.º 4
0
    def _get_relation(self, profile: NetworkProfile):
        """Fetch the "relationship" about-section and parse family relations.

        Downloads the ``section=relationship`` about page, isolates the
        relationship container with ``self._re_info_relation`` and passes the
        non-empty list fragment matched by ``self._re_relation`` to the private
        relation parser. Failures are logged with a traceback and swallowed.

        Args:
            profile: Profile to enrich; passed through to the parser.
        """
        try:
            url: str = "https://www.facebook.com/profile.php?id={}&sk=about&section=relationship&lst={}%3A{}%3A{}".format(
                profile._userid, self._userid, profile._userid,
                helper_time.ts_since_1970(10))

            html = self._ha.getstring(url,
                                      headers="""
            accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            accept-encoding: gzip, deflate
            accept-language: en-US,en;q=0.9
            cache-control: no-cache
            pragma: no-cache
            sec-fetch-mode: navigate
            sec-fetch-site: same-origin
            sec-fetch-user: ?1
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36
            viewport-width: 1600""")

            if html is None:
                return

            m = self._re_info_relation.search(html)
            if m is None:
                self._logger.debug("No relationship info found: {} {}".format(
                    profile.nickname, profile.url))
                return

            strdiv: str = m.group(1).strip()
            m = self._re_relation.search(strdiv)
            if m is not None:
                ul = m.group(1).strip()
                # strip() always returns a str, so truthiness covers "empty".
                if ul:
                    self.__parse_relations(profile, ul)

        except Exception:
            # Function-scope import keeps this block self-contained.
            import traceback
            # The message names the page actually requested (it used to say
            # "education") and carries the traceback so failures are traceable.
            self._logger.error(
                "Get relationship page failed: username:{} url:{}\nerror: {}".
                format(profile._networkid, profile.url,
                       traceback.format_exc()))
Ejemplo n.º 5
0
    def _get_addrinfo(self, profile: NetworkProfile):
        """Fetch the "contact-info" about-section and fill basic profile fields.

        Downloads the ``section=contact-info`` about page and extracts, when
        present: the profile picture (stored base64-encoded in
        ``profile._profile_pic``), gender, birthday and mobile phone number.
        The field labels searched for are the Chinese UI strings for gender,
        date of birth and mobile phone. Picture and phone extraction are best
        effort: their failures are logged at debug level and do not stop the
        remaining fields from being processed. Any other failure is logged as
        an error and swallowed.

        Args:
            profile: Profile to enrich. ``_userid``, ``_networkid``,
                ``nickname`` and ``url`` are read; ``_profile_pic``,
                ``gender``, ``birthday`` are written and ``set_phone`` is
                called when the corresponding data is found.
        """
        try:
            # Example about-page URL (overview section shown):
            # https://www.facebook.com/profile.php?id=100030846743121&sk=about&section=overview&lst=100013325533097%3A100030846743121%3A1568790537

            url: str = "https://www.facebook.com/profile.php?id={}&sk=about&section=contact-info&lst={}%3A{}%3A{}".format(
                profile._userid, self._userid, profile._userid,
                helper_time.ts_since_1970(10))

            html = self._ha.getstring(url,
                                      headers="""
            accept: */*
            accept-encoding: gzip, deflate
            accept-language: en-US,en;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://www.facebook.com
            pragma: no-cache
            referer: {}
            sec-fetch-mode: cors
            sec-fetch-site: same-origin""".format(profile.url))

            if html is None:
                return

            # Comment delimiters are removed so that markup wrapped in HTML
            # comments is parsed as regular elements.
            soup = BeautifulSoup(
                html.replace('<!--', '').replace('-->', ''), 'lxml')

            # Profile picture (best effort).
            photo = soup.select_one('._11kf.img')
            if photo:
                try:
                    pic_url = photo.attrs['src'].replace('amp;', '')
                    pic = self._ha.get_response_stream(pic_url)
                    profile._profile_pic = helper_str.base64bytes(pic.read())
                except Exception:
                    # Was a bare ``except: pass``; still non-fatal, but no
                    # longer hides KeyboardInterrupt/SystemExit or the reason.
                    self._logger.debug(
                        "Get profile picture failed: {} {}".format(
                            profile.nickname, profile.url))

            for code in soup.select('.hidden_elem code'):
                str_code = str(code)
                if '性别' in str_code or '出生日期' in str_code:
                    # Gender and/or birthday block. The birthday is handled
                    # here; a later ``elif`` on the same label could never run
                    # and has been removed.
                    sex = re.findall(r'性别.*?class="_2iem">(.*?)</span>',
                                     str_code)
                    if sex:
                        profile.gender = sex[0]

                    birth = re.findall(r'出生日期.*?class="_2iem">(.*?)</span>',
                                       str_code)
                    if birth:
                        profile.birthday = birth[0]

                elif '手机' in str_code:
                    # Mobile phone block: re-parse only here, the sole branch
                    # that needs element access on the fragment.
                    fragment = BeautifulSoup(str_code, 'lxml')
                    try:
                        # Take the first text node of the value element and
                        # drop '.', '-' and spaces from the number.
                        profile.set_phone(
                            fragment.select_one('[class="_2iem"]').get_text(
                                '--*--').split('--*--')[0].replace(
                                    '.', '').replace('-', '').replace(' ', ''))
                    except Exception:
                        self._logger.debug(
                            "Get phone number failed: {} {}".format(
                                profile.nickname, profile.url))

        except Exception:
            self._logger.error(
                "Get contact-info page failed: username:{} url:{}".format(
                    profile._networkid, profile.url))
Ejemplo n.º 6
0
    def _get_contacts_sub(self,
                          task: IscoutTask,
                          hostuser: NetworkProfile,
                          reason: str = None,
                          get_profile_pic: bool = False) -> iter:
        """Yield the contacts (friends) of ``hostuser``, page by page.

        After opening the host user's home page, contact pages are requested
        in a loop. Paging tokens (collection token, pagelet token, cursor) are
        extracted from the first response only. Each page is parsed with
        ``self._parse_contacts``; contacts whose ``_userid`` was already seen
        are skipped. Iteration stops when a page brings no new contact, when
        no cursor was found, when the contact cap is reached, or when too many
        pages in a row fail.

        Args:
            task: The task this crawl belongs to (not used in this method).
            hostuser: Profile whose contacts are listed.
            reason: Passed through to ``self._parse_contacts``.
            get_profile_pic: Passed through to ``self._parse_contacts``.

        Yields:
            The contact objects produced by ``self._parse_contacts``, each
            distinct ``_userid`` at most once.
        """
        # Only the first 100 contacts are collected (hard-coded for now).
        max_contacts = 100
        # A page that keeps failing (e.g. the request returns None on every
        # attempt) used to be retried forever; give up after this many
        # consecutive failures.
        max_page_failures = 3
        try:
            hostuserid: str = hostuser._userid
            username: str = hostuser._networkid
            userurl: str = hostuser.url

            # Normalize "...id=<n>..." URLs to the canonical profile.php form.
            if 'id=' in userurl:
                m = FBContacts._re_userid.search(userurl)
                if m is not None:
                    userurl = 'https://www.facebook.com/profile.php?id={}'.format(
                        m.group(1).strip())

            if not self._access_user_home(hostuserid, username, userurl):
                return

            totalcount = 0
            page = 0
            failures = 0
            collection_token: str = None
            cursor: str = None
            pagelet_token: str = None
            lst_ts: str = str(helper_time.ts_since_1970(10))
            isfirst: bool = True
            seen_userids: set = set()
            while True:
                try:

                    url = self._build_contacts_url(hostuserid, username,
                                                   userurl, page,
                                                   collection_token, cursor,
                                                   pagelet_token, lst_ts,
                                                   isfirst)

                    html = self._ha.getstring(url,
                                              headers='''
                    accept: */*
                    accept-encoding: gzip, deflate
                    accept-language: en,zh-CN;q=0.9,zh;q=0.8
                    cache-control: no-cache
                    pragma: no-cache
                    referer: {}
                    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'''
                                              .format(userurl))

                    if isfirst:
                        # aria-controls=\"pagelet_timeline_app_collection_100012401268356:2356318349:2\"
                        m = re.search(
                            r'pagelet_timeline_app_collection_([\d:]+?)\\"',
                            html, re.S)
                        if m is not None:
                            # e.g. 100012401268356:2356318349:2
                            collection_token = m.group(1).rstrip()
                        # "pagelet_token":"AWv9jIWdg5s52Nof1GKmH2bepQJzpLGdiuqTT299-mCLaJbofF9UT2MQtTo4Dk75kr8"
                        _, pagelet_token = helper_str.substringif(
                            html, 'pagelet_token":"', '"')

                        # NOTE(review): the cursor is only read from the first
                        # page and never refreshed here -- confirm that
                        # _build_contacts_url does not need a per-page cursor.
                        m = re.search(r'enableContentLoader",.*?,"(.+?)"]]',
                                      html, re.S)
                        if m is not None:
                            cursor = m.group(1).strip()

                        isfirst = False

                    curr_page_ct_cnt = 0
                    for ct in self._parse_contacts(hostuser, html, reason,
                                                   get_profile_pic):

                        if ct._userid in seen_userids:
                            continue
                        seen_userids.add(ct._userid)
                        curr_page_ct_cnt += 1
                        totalcount += 1

                        self._logger.info(
                            "Got a contact, hostuser:{}({}), contact:{}({})".
                            format(hostuser.nickname, hostuser._userid,
                                   ct.nickname, ct._userid))

                        yield ct

                        if totalcount >= max_contacts:
                            break

                    # The page was fetched and parsed without raising.
                    failures = 0

                    if curr_page_ct_cnt < 1:
                        self._logger.info(
                            "No contact found on page {} of user {}({})".
                            format(page, hostuser.nickname, hostuser._userid))
                        break

                    self._logger.info(
                        "Get user {} contacts on page {}, {} contacts found"
                        .format(username, page, curr_page_ct_cnt))

                    # isfirst is always False at this point, so a missing
                    # cursor means there is nothing further to page through.
                    if cursor is None:
                        break
                    if totalcount >= max_contacts:
                        break

                except Exception:
                    failures += 1
                    self._logger.error(
                        "Get contacts ont page {} failed: userid:{} nickname:{} ex:{}"
                        .format(page, hostuserid, username,
                                traceback.format_exc()))
                    if failures >= max_page_failures:
                        self._logger.error(
                            "Stop getting contacts after {} consecutive failures: userid:{} nickname:{}"
                            .format(failures, hostuserid, username))
                        break
                finally:
                    # Runs on every exit path of the iteration, including the
                    # breaks above: advance the page and throttle requests.
                    page += 1
                    time.sleep(0.5)

            self._logger.info("Got {} contacts of user {}({})".format(
                totalcount, hostuser.nickname, hostuser._userid))

        except Exception:
            self._logger.error(
                "Get fb contacts error:\nuserid={}\nusername={}\nerror:{}".
                format(hostuser._userid, hostuser.nickname,
                       traceback.format_exc()))
Ejemplo n.º 7
0
    def _parse_chatlog(self, msg: map, threadtype: str, ownerid: str) -> iter:
        """Turn one JSON-decoded message into chat-log and resource objects.

        Yields every resource fetched for the message (sticker, blob
        attachments, extensible-attachment video/image) as soon as it is
        attached, and finally the ``ICHATLOG_ONE`` object itself. Messages
        lacking a sender id, a message id or a non-empty ``__typename`` yield
        nothing. Errors are logged with a traceback and swallowed.

        Args:
            msg: Message mapping decoded from JSON.
            threadtype: Field decoded from the same JSON, presumably the
                conversation type; anything other than ``"ONE_TO_ONE"`` (and
                not None) is treated as a group chat.
            ownerid: Passed through to the ``ICHATLOG_ONE`` constructor.
        """
        try:
            if msg is None:
                self._logger.error(
                    "Invalid msg map object for parseing chat log: {}".format(
                        msg))
                return

            # Sender id and message id are mandatory.
            if not ('message_sender' in msg and 'id' in msg['message_sender']
                    and 'message_id' in msg):
                return

            # 0 = private chat, 1 = group chat.
            chattype: int = 0
            if threadtype is not None and threadtype != "ONE_TO_ONE":
                chattype = 1

            # Message type (picture, video, ...).
            if '__typename' not in msg:
                return
            typename = msg['__typename']
            if typename is None or typename == '':
                return
            msgtype: str = self._judge_message_type(typename)

            # Send time, derived from the precise timestamp when available.
            sendtime: str = None
            timestamp_precise = None
            if 'timestamp_precise' in msg:
                try:
                    timestamp_precise = int(msg['timestamp_precise'])
                    sendtime = helper_time.timespan_to_datestr(
                        timestamp_precise)
                except Exception:
                    # Fall back to the current time when conversion fails.
                    sendtime = helper_time.timespan_to_datestr(
                        helper_time.ts_since_1970())

            # Build the chat-log object.
            ctg = ICHATLOG_ONE(self.task, self._appcfg._apptype, self._userid,
                               msgtype, ownerid, chattype, msg['message_id'],
                               msg['message_sender']['id'], sendtime)
            ctg.remarks = timestamp_precise

            def _attach(resources):
                # Attach each fetched resource to the chat log, then emit it.
                for rsc in resources:
                    ctg.append_resource(rsc)
                    yield rsc

            # Read / unread flag.
            if 'unread' in msg:
                ctg.isread = 0 if msg['unread'].strip().lower() == 'true' else 1

            # Sticker resource.
            sticker = msg['sticker'] if 'sticker' in msg else None
            if sticker is not None and 'url' in sticker and 'label' in sticker:
                sticker_url = sticker['url'].replace('\\', '').rstrip()
                sticker_id = helper_crypto.get_md5_from_str(sticker_url)
                if 'id' in sticker:
                    sticker_id = sticker['id']
                yield from _attach(
                    self._fetch_resources(sticker_url, EResourceType.Picture,
                                          sticker_id))

            # Snippet: description text of system messages.
            if 'snippet' in msg:
                ctg.content += msg['snippet']

            # Whether the other party answered.
            if 'answered' in msg:
                ctg.answered = 0 if msg['answered'] == 'false' else 1

            # Blob attachments.
            if 'blob_attachments' in msg and msg['blob_attachments'] is not None:
                for blob in msg['blob_attachments']:
                    if '__typename' not in blob:
                        continue
                    # Attachment url and resource type.
                    blob_url, blob_type = self._get_attachments_type_and_url(
                        blob)
                    if not isinstance(blob_url, str) or blob_url == "":
                        self._logger.warn(
                            "Get attachment url failed: {}".format(blob))
                        continue
                    # Resource id: attachment id, else file fbid, else url md5.
                    blob_id: str = None
                    if 'legacy_attachment_id' in blob:
                        blob_id = blob['legacy_attachment_id']
                    elif 'message_file_fbid' in blob:
                        blob_id = blob['message_file_fbid']
                    if not isinstance(blob_id, str) or blob_id == "":
                        blob_id = helper_crypto.get_md5_from_str(blob_url)
                    # Attachment file name, when given.
                    blob_name = blob['filename'] if 'filename' in blob else None
                    # Download.
                    yield from _attach(
                        self._fetch_resources(blob_url, blob_type, blob_id,
                                              blob_name))

            # Extensible attachment (shared story media).
            extensible = None
            if 'extensible_attachment' in msg:
                extensible = msg['extensible_attachment']
            if extensible is not None and 'legacy_attachment_id' in extensible:
                resourceid = extensible['legacy_attachment_id']
                if "story_attachment" in extensible \
                        and 'media' in extensible['story_attachment']:
                    jmedia = extensible['story_attachment']['media']

                    if 'is_playable' in jmedia \
                            and jmedia['is_playable'] == 'true' \
                            and 'playable_url' in jmedia:
                        video_url = jmedia['playable_url'].rstrip().replace(
                            '\\', '').rstrip()
                        yield from _attach(
                            self._fetch_resources(video_url,
                                                  EResourceType.Video,
                                                  resourceid))

                    if 'image' in jmedia and 'uri' in jmedia['image']:
                        image_url = jmedia['image']['uri'].rstrip().replace(
                            '\\', '').rstrip()
                        yield from _attach(
                            self._fetch_resources(image_url,
                                                  EResourceType.Picture,
                                                  resourceid))

            # Message text.
            if 'message' in msg and msg['message'] is not None:
                if 'text' in msg['message']:
                    text = msg['message']['text']
                    if text is not None and not text == '':
                        ctg.content += text

            yield ctg

        except Exception:
            failure = traceback.format_exc()
            self._logger.error(
                "Parse one chatlog msg error:\nmsg:{}\nerror:{}".format(
                    msg, failure))