Beispiel #1
0
from newsflow.config import conf, logger
from newsflow.nntp import NNTPConnection

import simplejson as json
import yenc

from os import unlink, makedirs, stat
from traceback import format_exc
from time import time
import os.path
import sys
import re

# Module-level logger, obtained from newsflow.config under the name
# 'newsflow.consumer'.
log = logger('newsflow.consumer')


class Consumer(NNTPConnection):
    def __init__(self, host, username, password, database, port=119):
        """Initialise the underlying NNTP connection and keep the credentials.

        ``host`` and ``port`` are handed to ``NNTPConnection.__init__``
        together with the ``scraper.reconnect_timeout`` configuration value.
        ``username``, ``password`` and ``database`` are only stored on the
        instance here (the latter as ``self.db``).
        """
        timeout = conf('scraper.reconnect_timeout')
        NNTPConnection.__init__(self, host, port, reconnect_timeout=timeout)
        self.db = database
        self.username, self.password = username, password

    def consume(self):
        """Do nothing and return ``None``.

        NOTE(review): this looks like an unimplemented placeholder -- confirm
        whether real work is expected here.
        """
        return None

    def get_tmpdir(self, group):
        tmpdir = '%s/%s/' % (conf('scraper.tmp_path'), group)
        try:
            makedirs(tmpdir)
        except:
Beispiel #2
0
from newsflow.config import logger, conf
from time import sleep
import socket

# Module-level logger, obtained from newsflow.config under the name
# 'newsflow.nntp'; used by NNTPConnection.connect for debug/error output.
log = logger('newsflow.nntp')

class NNTPConnection(object):
    def __init__(self, host, port, reconnect_timeout=300):
        self.host = host
        self.port = port
        self.reconnect_timeout = reconnect_timeout
        self.connect()

    def connect(self):
        """Open a TCP connection to ``self.host``:``self.port``.

        The host is resolved with ``socket.getaddrinfo`` and each returned
        address is tried in order. The first socket that connects is stored
        in ``self.sock`` and the remaining candidates are not attempted. If
        no candidate connects (or none are returned), ``self.sock`` is set to
        ``None`` and an error is logged.

        Exceptions raised by ``getaddrinfo`` itself are not caught here.
        """
        addrinfo = socket.getaddrinfo(self.host, self.port,
            socket.AF_UNSPEC, socket.SOCK_STREAM, 0, socket.AI_PASSIVE)
        # Define the attribute up front so the final check works even when
        # the resolver returned no candidates at all.
        self.sock = None
        for af, socktype, proto, _canonname, addr in addrinfo:
            log.debug('Attempting to connect to %s' % addr[0])
            try:
                sock = socket.socket(af, socktype, proto)
            except socket.error:
                continue
            try:
                sock.connect(addr)
            except socket.error:
                sock.close()
                continue
            # Keep the first working socket and stop: trying further
            # addresses would overwrite (and leak) this connection.
            self.sock = sock
            break
        if self.sock is None:
            log.error('Unable to connect to %s:%i' % (self.host, self.port))
Beispiel #3
0
from newsflow.config import conf, logger
from newsflow.nntp import NNTPConnection
import re

# Module-level logger, obtained from newsflow.config under the name
# 'newsflow.scraper'.
log = logger('newsflow.scraper')


class Scraper(NNTPConnection):
    def __init__(self, host, username, password, database, group, port=119):
        """Initialise the underlying NNTP connection and keep the scrape settings.

        ``host`` and ``port`` are handed to ``NNTPConnection.__init__``
        together with the ``scraper.reconnect_timeout`` configuration value.
        ``username``, ``password``, ``database`` (as ``self.db``) and
        ``group`` are stored on the instance.
        """
        timeout = conf('scraper.reconnect_timeout')
        NNTPConnection.__init__(self, host, port, reconnect_timeout=timeout)
        self.group = group
        self.db = database
        self.username, self.password = username, password

    def get_subjects(self):
        lastid = self.db.hget('lastid', self.group)
        if not lastid:
            lastid = conf('scraper.firstid')
        if not lastid:
            log.warning('lastid not found in db and firstid missing from config file, starting at post id 0')
            lastid = '0'
        self.send('XHDR Subject %s-' % lastid)

        line = self.readline()
        if not line.startswith('221'):
            log.error(line)

        log.info('Started downloading headers for ' + self.group)
        while True:
            line = self.readline()