resp = session.get(url)
    logger.info('Parsing content')
    parsed_html = bs4.BeautifulSoup(resp.text, 'html.parser')
    # Note: There should be only one element called div.desc
    # but there's no guarantee
    pages_text = parsed_html.find('div', class_='desc').get_text()
    config['num_pages'] = int(re.search('Page 1 of ([0-9]+)', pages_text).group(1))
    print 'Found {0} pages'.format(config['num_pages'])
    if args.start > config['num_pages']:
        print 'Start page', args.start, 'is greater than found pages:', config['num_pages']
        print 'Setting start pages to last page'
        args.start = config['num_pages']
    imdb_all = []
    username = os.path.splitext(os.path.basename(args.outfile))[0]
    with codecs.open(args.outfile, 'wb') as outfile:
        w = UnicodeWriter(outfile)
        # Only output header if file didn't exist
        w.writerow(['position','const','created','modified','description','Title','Title type','Directors',
                    '{0} rated'.format(username),'IMDb Rating','Runtime (mins)','Year','Genres','Num. Votes',
                    'Release Date (month/day/year)','URL'])
    for page in get_start_positions(config['num_pages'], args.start):
        pool.spawn(download_page, page[0], page[1])
    pool.join()
    with codecs.open(args.outfile, 'ab') as outfile:
        w = UnicodeWriter(outfile)
        w.writerows(imdb_all)
    end_time = time.time()
    print 'Downloaded', len(imdb_all), 'ratings in', pretty_seconds(end_time - start_time)
    logger.info('Downloaded %s ratings in %s', len(imdb_all), pretty_seconds(end_time - start_time))
    print 'Saved results in', args.outfile
Example #2
0
    def handle(self):
        """
        Reply to incoming requests, one request per received line.

        Request:
            <COMMAND> <FILTER-TYPE> [SUBSTRING ...]
            COMMAND:     ALL|OPEN|INGAME
            FILTER-TYPE: NONE|MOD|HOST|DESC
                         (DESC is not handled by this method; it is logged
                         as an unknown FILTER-TYPE and the request is
                         skipped.)
            SUBSTRING:   The text to look for in the column FILTER-TYPE. If
                         space[s] is encountered, each word must be in the
                         field (AND). If '|' is encountered, word[s] before
                         and after it will be searched for separately and
                         all results will be returned (OR). A host matching
                         more than one alternative is returned only once.
        Reply:
            1st line: 'START <ISO 8601 timestamp, UTC>'
            2nd: List of hosts as an UTF-8 encoded CSV using ; as separator and
               quoting every field. The list will be filtered if
               FILTER-TYPE != NONE.
            3rd: 'END <length of list>'

        Requests with a format error, an unknown COMMAND or an unknown
        FILTER-TYPE are logged and skipped without sending a reply. The
        server-shutdown flag and the MAX_CONNECTION_LENGTH limit are only
        checked when a line arrives.
        """
        # COMMAND -> name of the handler attribute holding the matching hosts
        host_sources = {
            "ALL": "hosts",
            "OPEN": "hosts_open",
            "INGAME": "hosts_ingame",
        }
        # FILTER-TYPE -> name of the host attribute that is searched
        filter_fields = {"MOD": "gameName", "HOST": "founder"}
        try:
            for line in self.rfile:
                # loop until disconnect or server shutdown
                if self.server.shutdown_now:
                    logger.info("(%s:%d) server shut down already, bye bye",
                                self.client_address[0], self.client_address[1])
                    self.finish()
                    return
                # remote sockets are not always closed, kill myself after MAX_CONNECTION_LENGTH seconds
                age = datetime.datetime.now() - self.thread.start_time
                if age > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                    logger.info(
                        "(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                        self.client_address[0], self.client_address[1],
                        self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                        MAX_CONNECTION_LENGTH, self.thread.name)
                    self.finish()
                    return

                # query_stats_add() is handed the raw line, before tokenising
                self.server.query_stats_add(line)
                tokens = line.split()
                if len(tokens) < 2 or (len(tokens) == 2 and tokens[1] != "NONE"):
                    logger.error("(%s:%d) Format error: '%s'",
                                 self.client_address[0],
                                 self.client_address[1], tokens)
                    continue
                command, filter_type = tokens[0], tokens[1]

                # COMMAND
                if command not in host_sources:
                    logger.error("(%s:%d) Unknown COMMAND '%s'.",
                                 self.client_address[0],
                                 self.client_address[1], command)
                    continue
                # list() so that len() and [0] below work whatever kind of
                # iterable values() returns
                host_list = list(getattr(self, host_sources[command]).values())

                # FILTER-TYPE
                if filter_type == "NONE":
                    host_list_filtered = host_list
                elif filter_type in filter_fields:
                    field = filter_fields[filter_type]
                    host_list_filtered = []
                    # ids of hosts already selected, so that a host matching
                    # several '|' alternatives is not listed (and counted in
                    # the END line) more than once
                    seen = set()
                    for words in " ".join(tokens[2:]).split("|"):
                        for host in host_list:
                            if id(host) in seen:
                                continue
                            if substr_search(words, getattr(host, field)):
                                seen.add(id(host))
                                host_list_filtered.append(host)
                else:
                    # report the FILTER-TYPE token, not the COMMAND
                    logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.",
                                 self.client_address[0],
                                 self.client_address[1], filter_type)
                    continue

                # NOTE(review): the CSV buffer presumably holds UTF-8 encoded
                # bytes (see the reply format above) - confirm against
                # UnicodeWriter. START/END are encoded too, so concatenating
                # never implicitly ASCII-decodes non-ASCII host data.
                header = (u"START %s\n" %
                          datetime.datetime.utcnow().isoformat()).encode("utf-8")
                body = b""
                if host_list_filtered:
                    csvfile = cStringIO.StringIO()
                    try:
                        csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                        csvwriter.writerow(host_list_filtered[0].as_list_header())
                        csvwriter.writerows(
                            [host.as_list() for host in host_list_filtered])
                        body = csvfile.getvalue()
                    finally:
                        csvfile.close()
                footer = (u"END %d\n" % len(host_list_filtered)).encode("utf-8")
                self.wfile.write(header + body + footer)
        except socket.error:
            # client disconnected. that's OK, thread will terminate now
            elapsed = datetime.datetime.now() - self.thread.start_time
            # timedelta.seconds alone drops whole days, so add them back in
            logger.debug(
                "(%s:%d) client disconnected after %0.1f min",
                self.client_address[0], self.client_address[1],
                (elapsed.days * 86400 + elapsed.seconds) / 60.0)
            self.finish()
            return
Example #3
0
    def handle(self):
        """
        Reply to incoming requests, one request per received line.

        Request:
            <COMMAND> <FILTER-TYPE> [SUBSTRING ...]
            COMMAND:     ALL|OPEN|INGAME
            FILTER-TYPE: NONE|MOD|HOST|DESC
                         (DESC is not handled by this method; it is logged
                         as an unknown FILTER-TYPE and the request is
                         skipped.)
            SUBSTRING:   The text to look for in the column FILTER-TYPE. If
                         space[s] is encountered, each word must be in the
                         field (AND). If '|' is encountered, word[s] before
                         and after it will be searched for separately and
                         all results will be returned (OR). A host matching
                         more than one alternative is returned only once.
        Reply:
            1st line: 'START <ISO 8601 timestamp, UTC>'
            2nd: List of hosts as an UTF-8 encoded CSV using ; as separator and
               quoting every field. The list will be filtered if
               FILTER-TYPE != NONE.
            3rd: 'END <length of list>'

        Requests with a format error, an unknown COMMAND or an unknown
        FILTER-TYPE are logged and skipped without sending a reply. The
        server-shutdown flag and the MAX_CONNECTION_LENGTH limit are only
        checked when a line arrives.
        """
        client_ip, client_port = self.client_address[0], self.client_address[1]
        try:
            for line in self.rfile:
                # loop until disconnect or server shutdown
                if self.server.shutdown_now:
                    logger.info("(%s:%d) server shut down already, bye bye", client_ip, client_port)
                    self.finish()
                    return
                # remote sockets are not always closed, kill myself after MAX_CONNECTION_LENGTH seconds
                if datetime.datetime.now() - self.thread.start_time > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                    logger.info("(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                                client_ip, client_port,
                                self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"), MAX_CONNECTION_LENGTH,
                                self.thread.name)
                    self.finish()
                    return

                # query_stats_add() is handed the raw line, before tokenising
                self.server.query_stats_add(line)
                words_in_line = line.split()
                if len(words_in_line) < 2 or (len(words_in_line) == 2 and words_in_line[1] != "NONE"):
                    logger.error("(%s:%d) Format error: '%s'", client_ip, client_port, words_in_line)
                    continue
                command = words_in_line[0]
                filter_type = words_in_line[1]

                # COMMAND
                if command == "ALL":
                    host_source = self.hosts
                elif command == "OPEN":
                    host_source = self.hosts_open
                elif command == "INGAME":
                    host_source = self.hosts_ingame
                else:
                    logger.error("(%s:%d) Unknown COMMAND '%s'.", client_ip, client_port, command)
                    continue
                # list() so that len() and [0] below work whatever kind of iterable values() returns
                host_list = list(host_source.values())

                # FILTER-TYPE
                if filter_type == "NONE":
                    host_list_filtered = host_list
                elif filter_type in ("MOD", "HOST"):
                    # MOD searches the game name, HOST searches the founder
                    field = "gameName" if filter_type == "MOD" else "founder"
                    host_list_filtered = []
                    # ids of hosts already selected, so that a host matching several '|' alternatives
                    # is not listed (and counted in the END line) more than once
                    picked = set()
                    for words in " ".join(words_in_line[2:]).split("|"):
                        for host in host_list:
                            if id(host) not in picked and substr_search(words, getattr(host, field)):
                                picked.add(id(host))
                                host_list_filtered.append(host)
                else:
                    # report the FILTER-TYPE token, not the COMMAND
                    logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.", client_ip, client_port, filter_type)
                    continue

                # NOTE(review): the CSV buffer presumably holds UTF-8 encoded bytes (see the reply format
                # above) - confirm against UnicodeWriter. START/END are encoded too, so concatenating never
                # implicitly ASCII-decodes non-ASCII host data.
                parts = [(u"START %s\n" % datetime.datetime.utcnow().isoformat()).encode("utf-8")]
                if host_list_filtered:
                    csvfile = cStringIO.StringIO()
                    try:
                        csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                        csvwriter.writerow(host_list_filtered[0].as_list_header())
                        csvwriter.writerows([host.as_list() for host in host_list_filtered])
                        parts.append(csvfile.getvalue())
                    finally:
                        csvfile.close()
                parts.append((u"END %d\n" % len(host_list_filtered)).encode("utf-8"))
                self.wfile.write(b"".join(parts))
        except socket.error:
            # client disconnected. that's OK, thread will terminate now
            connected_for = datetime.datetime.now() - self.thread.start_time
            # timedelta.seconds alone drops whole days, so add them back in
            minutes = (connected_for.days * 86400 + connected_for.seconds) / 60.0
            logger.debug("(%s:%d) client disconnected after %0.1f min", client_ip, client_port, minutes)
            self.finish()
            return