Ejemplo n.º 1
0
def process_chunk(l0, l1, group, group_id,chunk,chunks):
	"""Index the overview entries l0..l1 of a newsgroup into the database.

	Opens its own NNTP connection, selects ``group``, fetches the overview
	(XOVER) for the article range and walks every entry:

	* a subject that does not end in ``(n/m)`` is stored once in ``articles``;
	* a subject ending in ``(n/m)`` is treated as part ``n`` of an ``m``-part
	  file: the file row is created on first sight (and linked through
	  ``file_group`` to every group named in the article's ``Newsgroups``
	  header), then the part is inserted into ``parts``.

	Finally ``groups.last_article`` of ``group`` is set to the number of the
	last overview entry.

	Relies on names defined outside this function: ``c`` / ``connection``
	(database cursor and its connection), ``decode``, ``NNTP_SSL``,
	``syslog``, ``re``, ``time`` and ``rfc822``.

	Args:
		l0: first article number of the range (passed to XOVER as a string).
		l1: last article number of the range.
		group: name of the newsgroup to read.
		group_id: value stored in ``articles.group_id`` for plain articles.
		chunk: index of this chunk; only used in progress messages.
		chunks: total number of chunks; only used in progress messages.

	Returns:
		``("fail", 0)`` when the NNTP connection cannot be opened, otherwise
		``(docs_xml, last)`` where ``docs_xml`` is always ``""`` here and
		``last`` is the article number of the last overview entry, or
		``str(l1)`` when the server returned no entries for the range.
	"""
	# NOTE(review): host and credentials are hard-coded; they should come
	# from configuration, not from source control.
	try:
		s = NNTP_SSL('secure.news.astraweb.com', 443, 'nemik1', 'this1moois')
	except Exception as err:
		syslog.syslog("NNTP connection failed: %s" % (err,))
		return ("fail", 0)

	# Matches the trailing "(part/total)" marker of a multi-part subject.
	part_pattern = re.compile(r"\((\d+)/(\d+)\)$")
	# Never populated in this function; returned as-is for the caller.
	docs_xml = ""

	try:
		# Selecting the group is required before XOVER; the reply is unused.
		s.group(group)

		# print(<single expression>) behaves the same with and without
		# print_function, so the Python 2 output is unchanged.
		progress = (int(chunk), int(chunks), int(l0), int(l1), group)
		print("\n------NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s-----\n\n" % progress)
		syslog.syslog("NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s" % progress)

		resp, subs = s.xover(str(l0), str(l1))
		if not subs:
			# Nothing in this range. The group's last_article is left alone.
			# NOTE(review): str(l1) is returned as "last processed" so callers
			# can move on; confirm this is what the driver loop expects.
			syslog.syslog("NO ARTICLES IN %d/%d FROM %s" % (int(l0), int(l1), group))
			return (docs_xml, str(l1))

		for sub in subs:
			# Overview fields as used below: [0] article number, [1] subject,
			# [2] poster, [3] date header, [4] id stored as articles.id /
			# parts.id, [6] size stored as parts.bytes.
			article_number = sub[0]
			message_id = sub[4]
			size = sub[6]
			subject_exact = decode(sub[1])
			poster = decode(sub[2])
			# maybe do this by looking for the pattern everywhere and taking
			# the last one, not just end of line? many files appear as articles
			subject_file = part_pattern.sub("", subject_exact)
			part_match = part_pattern.search(subject_exact)

			# e.g. "Fri, 03 Jul 2009 19:31:58 +0200"
			# NOTE(review): mktime() interprets the tuple as local time and
			# ignores the header's UTC offset; kept so new rows stay
			# comparable with rows already stored.
			try:
				date = int(time.mktime(rfc822.parsedate(sub[3])))
			except (TypeError, ValueError, OverflowError):
				# parsedate() returns None for a malformed header; skip this
				# entry instead of aborting the whole chunk.
				syslog.syslog("skipping article %s: unparseable date %r" % (article_number, sub[3]))
				continue

			if part_match is None:
				# Plain article (no part marker): store it once.
				c.execute("SELECT id FROM articles WHERE id = %s", (message_id,))
				if c.fetchone() is None:
					try:
						print("ARTICLE: : %s" % subject_exact)
					except UnicodeError:
						print("ARTICLE: f**k you print-ascii")
					c.execute("INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))", (message_id, group_id, subject_exact, poster, date, date,))
					connection.commit()
				continue

			# part is a tuple of ints (part #, total)
			part = (int(part_match.group(1)), int(part_match.group(2)))

			# Parameters must be a sequence: note the one-element tuple.
			c.execute("SELECT id FROM files WHERE subject = %s", (subject_file,))
			file_row = c.fetchone()
			if file_row is not None:
				fid = int(file_row[0])
			else:
				try:
					print("processing %s" % subject_file)
				except UnicodeError:
					print("processing SOMETHING CAN'T BE PRINTED, f**k you ascii")
				c.execute("INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))", (subject_file, subject_file, poster, part[1], date, date))
				connection.commit()
				c.execute("SELECT id FROM files WHERE name = %s", (subject_file,))
				fid = int(c.fetchone()[0])

				# Find every group the article was cross-posted to.
				headers = s.head(article_number)
				groups_posted = []
				for header in headers[3]:
					if "Newsgroups: " in header:
						names = header.replace('Newsgroups: ', '').split(',')
						groups_posted = [name.strip() for name in names if name.strip()]

				# A distinct loop variable keeps the `group` argument intact
				# for the final UPDATE and status messages.
				for posted_group in groups_posted:
					# guarantee that the group exists, otherwise create it
					c.execute("SELECT id FROM groups WHERE name = %s", (posted_group,))
					group_row = c.fetchone()
					if group_row is None:
						c.execute("INSERT INTO groups (name, last_article) VALUES(%s, %s)", (posted_group, 0,))
						connection.commit()
						c.execute("SELECT id FROM groups WHERE name = %s", (posted_group,))
						group_row = c.fetchone()
					gid = int(group_row[0])

					c.execute("SELECT id FROM file_group WHERE file_id = %s AND group_id = %s", (fid, gid,))
					if c.fetchone() is None:
						c.execute("INSERT INTO file_group (file_id, group_id) VALUES (%s, %s)", (fid, gid,))
						connection.commit()

			# add part to parts table and associate with file
			try:
				c.execute("INSERT INTO parts (id, file_id, bytes, number) VALUES (%s, %s, %s, %s)", (message_id, fid, size, part[0]))
				connection.commit()
			except Exception:
				# Best effort (presumably the part is already stored). Roll
				# back so a failed statement cannot poison later ones.
				connection.rollback()

		first_article = subs[0][0]
		last_article = subs[-1][0]
		firprint = "first article:%slast article:%sid is %s" % (first_article, last_article, last_article)
		print(firprint)
		syslog.syslog(firprint)
		c.execute("UPDATE groups SET last_article = %s WHERE name = %s", (last_article, group,))
		connection.commit()

		print("\n\n ---- STATUS OF %s UDPATED! ---- \n\n" % (group))
		syslog.syslog("STATUS OF %s UDPATED!" % (group))

		return (docs_xml, last_article)
	finally:
		# Always release the NNTP connection; a failing QUIT must not mask
		# the result or the original exception.
		try:
			s.quit()
		except Exception:
			pass
Ejemplo n.º 2
0
def process_chunk(l0, l1, group, group_id, chunk, chunks):
    """Index the overview entries l0..l1 of a newsgroup into the database.

    Opens its own NNTP connection, selects ``group``, fetches the overview
    (XOVER) for the article range and walks every entry:

    * a subject that does not end in ``(n/m)`` is stored once in ``articles``;
    * a subject ending in ``(n/m)`` is treated as part ``n`` of an ``m``-part
      file: the file row is created on first sight (and linked through
      ``file_group`` to every group named in the article's ``Newsgroups``
      header), then the part is inserted into ``parts``.

    Finally ``groups.last_article`` of ``group`` is set to the number of the
    last overview entry.

    Relies on names defined outside this function: ``c`` / ``connection``
    (database cursor and its connection), ``decode``, ``NNTP_SSL``,
    ``syslog``, ``re``, ``time`` and ``rfc822``.

    Args:
        l0: first article number of the range (passed to XOVER as a string).
        l1: last article number of the range.
        group: name of the newsgroup to read.
        group_id: value stored in ``articles.group_id`` for plain articles.
        chunk: index of this chunk; only used in progress messages.
        chunks: total number of chunks; only used in progress messages.

    Returns:
        ``("fail", 0)`` when the NNTP connection cannot be opened, otherwise
        ``(docs_xml, last)`` where ``docs_xml`` is always ``""`` here and
        ``last`` is the article number of the last overview entry, or
        ``str(l1)`` when the server returned no entries for the range.
    """
    # NOTE(review): host and credentials are hard-coded; they should come
    # from configuration, not from source control.
    try:
        s = NNTP_SSL("secure.news.astraweb.com", 443, "nemik1", "this1moois")
    except Exception as err:
        syslog.syslog("NNTP connection failed: %s" % (err,))
        return ("fail", 0)

    # Matches the trailing "(part/total)" marker of a multi-part subject.
    part_pattern = re.compile(r"\((\d+)/(\d+)\)$")
    # Never populated in this function; returned as-is for the caller.
    docs_xml = ""

    try:
        # Selecting the group is required before XOVER; the reply is unused.
        s.group(group)

        # print(<single expression>) behaves the same with and without
        # print_function, so the Python 2 output is unchanged.
        progress = (int(chunk), int(chunks), int(l0), int(l1), group)
        print("\n------NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s-----\n\n" % progress)
        syslog.syslog("NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s" % progress)

        resp, subs = s.xover(str(l0), str(l1))
        if not subs:
            # Nothing in this range. The group's last_article is left alone.
            # NOTE(review): str(l1) is returned as "last processed" so callers
            # can move on; confirm this is what the driver loop expects.
            syslog.syslog("NO ARTICLES IN %d/%d FROM %s" % (int(l0), int(l1), group))
            return (docs_xml, str(l1))

        for sub in subs:
            # Overview fields as used below: [0] article number, [1] subject,
            # [2] poster, [3] date header, [4] id stored as articles.id /
            # parts.id, [6] size stored as parts.bytes.
            article_number = sub[0]
            message_id = sub[4]
            size = sub[6]
            subject_exact = decode(sub[1])
            poster = decode(sub[2])
            # maybe do this by looking for the pattern everywhere and taking
            # the last one, not just end of line? many files appear as articles
            subject_file = part_pattern.sub("", subject_exact)
            part_match = part_pattern.search(subject_exact)

            # e.g. "Fri, 03 Jul 2009 19:31:58 +0200"
            # NOTE(review): mktime() interprets the tuple as local time and
            # ignores the header's UTC offset; kept so new rows stay
            # comparable with rows already stored.
            try:
                date = int(time.mktime(rfc822.parsedate(sub[3])))
            except (TypeError, ValueError, OverflowError):
                # parsedate() returns None for a malformed header; skip this
                # entry instead of aborting the whole chunk.
                syslog.syslog("skipping article %s: unparseable date %r" % (article_number, sub[3]))
                continue

            if part_match is None:
                # Plain article (no part marker): store it once.
                c.execute("SELECT id FROM articles WHERE id = %s", (message_id,))
                if c.fetchone() is None:
                    try:
                        print("ARTICLE: : %s" % subject_exact)
                    except UnicodeError:
                        print("ARTICLE: f**k you print-ascii")
                    c.execute(
                        "INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))",
                        (message_id, group_id, subject_exact, poster, date, date),
                    )
                    connection.commit()
                continue

            # part is a tuple of ints (part #, total)
            part = (int(part_match.group(1)), int(part_match.group(2)))

            # Parameters must be a sequence: note the one-element tuple.
            c.execute("SELECT id FROM files WHERE subject = %s", (subject_file,))
            file_row = c.fetchone()
            if file_row is not None:
                fid = int(file_row[0])
            else:
                try:
                    print("processing %s" % subject_file)
                except UnicodeError:
                    print("processing SOMETHING CAN'T BE PRINTED, f**k you ascii")
                c.execute(
                    "INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))",
                    (subject_file, subject_file, poster, part[1], date, date),
                )
                connection.commit()
                c.execute("SELECT id FROM files WHERE name = %s", (subject_file,))
                fid = int(c.fetchone()[0])

                # Find every group the article was cross-posted to.
                headers = s.head(article_number)
                groups_posted = []
                for header in headers[3]:
                    if "Newsgroups: " in header:
                        names = header.replace("Newsgroups: ", "").split(",")
                        groups_posted = [name.strip() for name in names if name.strip()]

                # A distinct loop variable keeps the `group` argument intact
                # for the final UPDATE and status messages.
                for posted_group in groups_posted:
                    # guarantee that the group exists, otherwise create it
                    c.execute("SELECT id FROM groups WHERE name = %s", (posted_group,))
                    group_row = c.fetchone()
                    if group_row is None:
                        c.execute("INSERT INTO groups (name, last_article) VALUES(%s, %s)", (posted_group, 0))
                        connection.commit()
                        c.execute("SELECT id FROM groups WHERE name = %s", (posted_group,))
                        group_row = c.fetchone()
                    gid = int(group_row[0])

                    c.execute("SELECT id FROM file_group WHERE file_id = %s AND group_id = %s", (fid, gid))
                    if c.fetchone() is None:
                        c.execute("INSERT INTO file_group (file_id, group_id) VALUES (%s, %s)", (fid, gid))
                        connection.commit()

            # add part to parts table and associate with file
            try:
                c.execute(
                    "INSERT INTO parts (id, file_id, bytes, number) VALUES (%s, %s, %s, %s)",
                    (message_id, fid, size, part[0]),
                )
                connection.commit()
            except Exception:
                # Best effort (presumably the part is already stored). Roll
                # back so a failed statement cannot poison later ones.
                connection.rollback()

        first_article = subs[0][0]
        last_article = subs[-1][0]
        firprint = "first article:%slast article:%sid is %s" % (first_article, last_article, last_article)
        print(firprint)
        syslog.syslog(firprint)
        c.execute("UPDATE groups SET last_article = %s WHERE name = %s", (last_article, group))
        connection.commit()

        print("\n\n ---- STATUS OF %s UDPATED! ---- \n\n" % (group))
        syslog.syslog("STATUS OF %s UDPATED!" % (group))

        return (docs_xml, last_article)
    finally:
        # Always release the NNTP connection; a failing QUIT must not mask
        # the result or the original exception.
        try:
            s.quit()
        except Exception:
            pass