Beispiel #1
0
def CreateLiveBlob(file_name):
    """Rebuild the profile database stored at ``file_name`` from crawl progress.

    The database is cleared, every section listed in ``Progress.SECTIONS`` is
    copied in, the "Fetish" string table is stored under "Fetishes", and then
    each id in the "CompletedProfiles" and "CompletedGroups" sections is
    loaded and added.  Ids whose ``load()`` returns a falsy value are reported
    through ``progress.errorProfile`` / ``progress.errorGroup`` and counted as
    errors.  Progress lines and a final summary are written to stderr.

    Args:
        file_name: passed unchanged to ``ProfileDb``.

    Returns:
        The ``ProfileDb`` object.  NOTE(review): it is returned from inside
        the ``with`` block, so its ``__exit__`` has already run by the time
        the caller receives it -- confirm that it is still usable afterwards.
    """
    with ProfileDb(file_name) as profileDb:
        profileDb.Clear()

        stringMap = StringMap()
        progress = Progress()
        for section in Progress.SECTIONS:
            profileDb.FillSection(section, progress.getIds(section))
        profileDb.FillStrings("Fetishes", stringMap.getSection("Fetish"))

        # ---- profiles ------------------------------------------------
        pids = set(progress.getIds("CompletedProfiles"))
        sys.stderr.write("Profiles to load: [%s]\n" % len(pids))
        ploaded = 0
        pfailed = 0
        ptotal = len(pids)
        for pid in pids:
            profile = Profile(pid)
            if profile.load():
                profileDb.AddProfile(profile)
                ploaded += 1
                # '//' keeps the percentage an integer on Python 2 and 3 alike.
                sys.stderr.write("Progress - Loaded Profile [%12s], [%12s] of [%12s], [%3s%% Done]\n" % (pid, ploaded, ptotal, 100 * (ploaded + pfailed) // ptotal))
            else:
                progress.errorProfile(pid)
                pfailed += 1
                sys.stderr.write("Progress - Failed Profile [%12s], [%12s] of [%12s], [%s%% Done]\n" % (pid, pfailed, ptotal, 100 * (ploaded + pfailed) // ptotal))
            # Drop the reference before the next object is created.
            del profile

        # ---- groups --------------------------------------------------
        gids = set(progress.getIds("CompletedGroups"))
        sys.stderr.write("Groups to load: [%s]\n" % len(gids))
        gloaded = 0
        gfailed = 0
        gtotal = len(gids)
        for gid in gids:
            group = Group(gid)
            if group.load():
                profileDb.AddGroup(group)
                gloaded += 1
                sys.stderr.write("Progress - Loaded Group [%12s], [%12s] of [%12s], [%3s%% Done]\n" % (gid, gloaded, gtotal, 100 * (gloaded + gfailed) // gtotal))
            else:
                progress.errorGroup(gid)
                # Was 'failed += 1': an undefined name that raised NameError
                # on the first group that failed to load.
                gfailed += 1
                sys.stderr.write("Progress - Failed Group [%12s], [%12s] of [%12s], [%s%% Done]\n" % (gid, gfailed, gtotal, 100 * (gloaded + gfailed) // gtotal))
            del group

        sys.stderr.write("Loaded [%d] Profiles [%d] Groups [%d] Errors.\n" % (ploaded, gloaded, pfailed + gfailed))
        return profileDb
Beispiel #2
0
    def fill(self,session):
        """Scrape this user's profile page and friend list into the instance.

        Requests ``https://fetlife.com/users/<Id>`` through ``session`` and
        parses, in order: name, age / gender / type, location, the details
        table (relationships, orientation, activity, looking-for), the last
        activity entry, group ids, the "Into:" and "Curious about:" fetish
        lists, and finally every page of the friends listing.  The crawl date
        is stamped at the end via ``self.setCrawlDate()``.

        Args:
            session: object whose ``get(url)`` returns a response exposing
                ``url`` and ``text`` (presumably a ``requests`` session --
                confirm against the caller).

        Returns:
            False when the response URL differs from the requested one (the
            profile is reported as missing); True once parsing has finished.

        Raises:
            RuntimeError: a details-table row has an unrecognised header.
            AssertionError: ``self.Id`` is not a number, or a recognised
                table row does not have the expected single value cell.
            IndexError: the name heading or the age/gender span is absent
                from the page.
        """
        sys.stderr.write("Loading Profile [%s]\n" % self.Id)
        assert isinstance(self.Id,numbers.Number)
        self._link = "https://fetlife.com/users/%s" % self.Id
        self._page = session.get(self._link)
        if self._page.url != self._link:
            # The response came back from a different URL than requested.
            sys.stderr.write("Missing Profile [%s]\n" % self.Id)
            return False

        tree = html.fromstring(self._page.text)

        self.Name = tree.xpath('//h2[@class="bottom"]/text()')[0].strip()
        rawPair = tree.xpath('//span[@class="small quiet"]/text()')[0].strip()
        splitList = rawPair.split(" ")
        if len(splitList) > 1:
            self.Type = splitList[1]
        try:
            self.Age = int(re.sub(r'[^0-9]', '', splitList[0]))
        except ValueError:
            # No digits in the first token.
            self.Age = -1
        # The original guarded this with `self.Age != splitList[0]`, an
        # int-vs-str comparison that is always true, so the assignment has
        # always been unconditional.
        self.Gender = re.sub(r'[0-9 ]', '', splitList[0])
        Location = tree.xpath('//div[@class="span-13 append-1"]/p/em/a/text()')
        self.Location = [unicode(x) for x in Location]
        table = tree.xpath('//div[@class="span-13 append-1"]/table/tr')

        for item in table:
            children = list(item)
            header = children[0]
            if header.text in ("relationship status:", "D/s relationship status:"):
                assert len(children[1:]) == 1
                td = children[1]
                assert len(td.getchildren()) == 1
                ul = td.getchildren()[0]
                for li in ul:
                    # Only entries with exactly one child element are
                    # recorded; that child supplies the href.
                    if len(li.getchildren()) == 1:
                        a = li.getchildren()[0]
                        url = a.get("href")
                        rel = li.text.strip()
                        pid = int(re.sub(r'[^0-9 ]', '', url))
                        self.Relationships.append((pid, rel))
            elif header.text == "orientation:":
                assert len(children[1:]) == 1
                self.Orientation = children[1].text
            elif header.text == "active:":
                assert len(children[1:]) == 1
                self.Active = children[1].text
            elif header.text == "is looking for:":
                assert len(children[1:]) == 1
                for text in children[1].itertext():
                    self.LookingFor.append(text)
            else:
                # Call form is valid on Python 2 and 3; the statement form
                # `raise X, "msg"` is a SyntaxError on Python 3.
                raise RuntimeError("Unknown table [%s]" % header.text)

        lastActive = tree.xpath('//ul[@id="mini_feed"]/li/span[@class="quiet small"]/text()')
        if len(lastActive) != 0:
            self.setLastActive(lastActive[0])

        for groupURL in tree.xpath('//li/a[contains(@href,"/groups/")]/@href'):
            try:
                self.Groups.add(int(re.sub(r'[^0-9]', '', groupURL)))
            except ValueError:
                # Link contains no digits, so it carries no group id.
                pass

        #---------------------------------------------------
        # Fetishes
        #---------------------------------------------------
        stringMap = StringMap()

        def collectFetishes(page, label, bucket):
            # Parse the paragraph introduced by <em>label</em> on `page` and
            # add each fetish id to bucket[qualifier], where qualifier is the
            # text of the element following the link (minus its first and
            # last character) or None when there is none.
            stuff = page.xpath('//em[text()="%s"]/ancestor::p' % label)
            if len(stuff) == 0:
                return
            pairs = []
            for entry in stuff[0]:
                if entry.text is None:
                    continue
                try:
                    if "href" in entry.keys():
                        fetishName = entry.text
                        fetishId = int(re.sub(r'[^0-9 ]', '', entry.get("href")))
                        pairs.append([fetishId, None])
                        if not stringMap.hasString("Fetish", fetishId):
                            stringMap.addString("Fetish", fetishId, fetishName)
                    elif len(pairs) > 0:
                        # Non-link element: qualifier for the preceding link.
                        pairs[-1][1] = entry.text[1:-1]
                except ValueError:
                    # href without a numeric id; skip this entry.
                    pass
            for (fetishId, qualifier) in pairs:
                if qualifier not in bucket:
                    bucket[qualifier] = set()
                bucket[qualifier].add(fetishId)

        collectFetishes(tree, "Into:", self.Into)
        collectFetishes(tree, "Curious about:", self.Curious)

        #---------------------------------------------------
        # Now, friends
        #---------------------------------------------------
        pageNum = 1
        while True:
            self._link = "https://fetlife.com/users/%s/friends?page=%d" % (self.Id, pageNum)
            self._page = session.get(self._link)
            tree = html.fromstring(self._page.text)

            urls = tree.xpath('//div[@class="clearfix user_in_list"]/div/a/@href')
            for url in urls:
                friend = int(re.sub(r'[^0-9 ]', '', url))
                self.Friends.append(friend)

            # Keep paging only while exactly one "next_page" link is present.
            nextLinks = tree.xpath('//a[@class="next_page"]')
            if len(nextLinks) == 1:
                pageNum += 1
            else:
                break

        self.setCrawlDate()
        sys.stderr.write("Done Loading Profile [%s]\n" % self.Id)

        return True
Beispiel #3
0
        while not progress.getExit():
            crawler.doTick()
        sys.stderr.write("Ending Crawler\n")
        progress.saveProgress()
        progress.setExit()
    else:
        def RunCrawler(num):
            """Thread body: run one crawler until the shared exit flag is set.

            Builds a ``Crawler`` from the enclosing scope's ``session`` and
            ``progress``, ticks it until ``progress.getExit()`` is true, then
            sets the exit flag itself.  ``num`` is only used to label the
            start and end messages written to stderr.
            """
            worker = Crawler(session, progress)
            sys.stderr.write("Starting Crawler [%d]\n" % num)
            while True:
                if progress.getExit():
                    break
                worker.doTick()
            sys.stderr.write("Ending Crawler [%d]\n" % num)
            progress.setExit()

        progress    =   Progress()
        stringMap   =   StringMap()
        progress.printProgress()
        threads     =   []
        for i in range(options.threads):
            threads.append(Thread(None,target=RunCrawler,args=(i,)))
            threads[-1].start()
        try:
            while not progress.getExit():
                time.sleep(60)
                progress.printProgress()
                progress.saveProgress()
                stringMap.save()
        except:
            sys.stderr.write("Shutting down from main thread\n")
            progress.setExit()