# Convert the book file named by argv[1] into one-sentence-per-line form:
# drop any word containing a non-ASCII character, join the survivors into
# one big text, split that into sentences, and write them to argv[1]+".raw".
words = []
with open(argv[1]) as book_file:
    for line in book_file:
        # split() already discards \n, \r and all other whitespace, so the
        # original explicit replace("\n"...)/replace("\r"...) calls are not needed.
        for word in line.split():
            # Keep only words made entirely of printable-ASCII-range characters.
            if all(ord(ch) <= 126 for ch in word):
                words.append(word)
# One join instead of repeated += (which is quadratic); each word keeps the
# leading space the original accumulation produced.
full_text = "".join(" " + word for word in words)
with open(argv[1] + ".raw", "wb") as out_file:
    for sentence in parse_sentences(full_text):
        out_file.write(sentence + "\n")
# Build a cleaned, ASCII-only text from the book file named by argv[1],
# then write its sentences to argv[1]+".raw", one per line.
out_file = open(argv[1] + ".raw", "wb")
full_text = ""
with open(argv[1]) as book_file:
    for raw_line in book_file:
        cleaned = raw_line.replace("\n", " ").replace("\r", "")
        for token in cleaned.split():
            # Reject any token that contains a byte above printable ASCII.
            is_ascii = True
            for ch in token:
                if ord(ch) > 126:
                    is_ascii = False
                    break
            if is_ascii:
                full_text += " " + token
sentences = parse_sentences(full_text)
for sentence in sentences:
    out_file.write(sentence + "\n")
out_file.close()
time.sleep(5) for thread in threads: if thread in crawled: continue try: threadpage = opener.open("http://boards.4chan.org/"+argv[1]+"/"+ thread).read() except urllib2.HTTPError, err: print "\n***\thttp://boards.4chan.org/"+argv[1]+"/"+ thread + ":\t"+str(err.code) + "\n" crawled.add(thread) continue text = re.findall('"quotelink">>>[0-9]+</a><br>(?!<blockquote>)([^<]+)</blockquote>', threadpage) for line in text: if "only visible to 4chan gold members" in line: continue line = re.sub("([\n\r()]+)","",line) line = line.replace("'","'") line = line.replace(""",'"') if " " in line: # Weed out small posts/replies that won't work with our chain for sentence in parse_sentences(line): print sentence out_file.write(sentence + "\n") crawled.add(thread) time.sleep(5)