from newsflow.config import conf, logger from newsflow.nntp import NNTPConnection import simplejson as json import yenc from os import unlink, makedirs, stat from traceback import format_exc from time import time import os.path import sys import re log = logger('newsflow.consumer') class Consumer(NNTPConnection): def __init__(self, host, username, password, database, port=119): NNTPConnection.__init__(self, host, port, reconnect_timeout=conf('scraper.reconnect_timeout')) self.username = username self.password = password self.db = database def consume(self): pass def get_tmpdir(self, group): tmpdir = '%s/%s/' % (conf('scraper.tmp_path'), group) try: makedirs(tmpdir) except:
from newsflow.config import logger, conf
from time import sleep
import socket

log = logger('newsflow.nntp')


class NNTPConnection(object):
    """TCP connection to an NNTP server, opened as soon as it is constructed."""

    def __init__(self, host, port, reconnect_timeout=300):
        """Store the connection settings and connect immediately.

        host -- server hostname or address, passed to socket.getaddrinfo
        port -- server port, passed to socket.getaddrinfo
        reconnect_timeout -- stored as self.reconnect_timeout
            (presumably a delay in seconds before reconnecting -- confirm
            against the code that reads it)
        """
        self.host = host
        self.port = port
        self.reconnect_timeout = reconnect_timeout
        self.connect()

    def connect(self):
        """Open a stream socket to (self.host, self.port).

        Every address returned by getaddrinfo is tried in order and the
        first one that connects is kept in self.sock. If none of them
        connects, self.sock is left as None and an error is logged; no
        exception is raised for the failed connection attempts themselves.
        Errors raised by getaddrinfo are not caught here.
        """
        addrinfo = socket.getaddrinfo(self.host, self.port, socket.AF_UNSPEC,
                                      socket.SOCK_STREAM, 0,
                                      socket.AI_PASSIVE)
        # Make sure the attribute exists even if no address was returned, so
        # the check after the loop cannot fail with AttributeError.
        self.sock = None
        for af, socktype, proto, canonical, addr in addrinfo:
            log.debug('Attempting to connect to %s' % addr[0])
            try:
                self.sock = socket.socket(af, socktype, proto)
            except socket.error:
                self.sock = None
                continue
            try:
                self.sock.connect(addr)
            except socket.error:
                self.sock.close()
                self.sock = None
                continue
            # Connected: stop here. Without this the loop would go on to the
            # next address and replace (and leak) the working socket.
            break
        if self.sock is None:
            log.error('Unable to connect to %s:%i' % (self.host, self.port))
from newsflow.config import conf, logger from newsflow.nntp import NNTPConnection import re log = logger('newsflow.scraper') class Scraper(NNTPConnection): def __init__(self, host, username, password, database, group, port=119): NNTPConnection.__init__(self, host, port, reconnect_timeout=conf('scraper.reconnect_timeout')) self.username = username self.password = password self.db = database self.group = group def get_subjects(self): lastid = self.db.hget('lastid', self.group) if not lastid: lastid = conf('scraper.firstid') if not lastid: log.warning('lastid not found in db and firstid missing from config file, starting at post id 0') lastid = '0' self.send('XHDR Subject %s-' % lastid) line = self.readline() if not line.startswith('221'): log.error(line) log.info('Started downloading headers for ' + self.group) while True: line = self.readline()