Ejemplo n.º 1
0
# Convert the text file named by argv[1] into "<argv[1]>.raw": one sentence
# per line, as produced by parse_sentences() (defined elsewhere).

# Collect every whitespace-separated word whose characters all have a code
# point of 126 or below; a word containing anything above that is dropped
# whole.
kept_words = []
with open(argv[1]) as book_file:
    for line in book_file:
        # Carriage returns are deleted (not treated as separators) before
        # splitting; split() with no argument already treats the newline
        # like any other whitespace, so no separate "\n" handling is needed.
        for word in line.replace("\r", "").split():
            if all(ord(letter) <= 126 for letter in word):
                kept_words.append(word)

# Every kept word is preceded by a single space, so a non-empty text starts
# with a space. Joining once avoids repeated string concatenation.
full_text = "".join(" " + word for word in kept_words)

sentences = parse_sentences(full_text)

# The output file is opened only once the sentences are ready, and inside a
# `with` block, so a failure above neither leaks the handle nor leaves an
# empty output file behind.
with open(argv[1] + ".raw", "wb") as out_file:
    for sentence in sentences:
        out_file.write(sentence + "\n")
Ejemplo n.º 2
0
# Convert the text file named by argv[1] into "<argv[1]>.raw": one sentence
# per line, as produced by parse_sentences() (defined elsewhere).

# Collect every whitespace-separated word whose characters all have a code
# point of 126 or below; a word containing anything above that is dropped
# whole.
kept_words = []
with open(argv[1]) as book_file:
    for line in book_file:
        # Carriage returns are deleted (not treated as separators) before
        # splitting; split() with no argument already treats the newline
        # like any other whitespace, so no separate "\n" handling is needed.
        for word in line.replace("\r", "").split():
            if all(ord(letter) <= 126 for letter in word):
                kept_words.append(word)

# Every kept word is preceded by a single space, so a non-empty text starts
# with a space. Joining once avoids repeated string concatenation.
full_text = "".join(" " + word for word in kept_words)

sentences = parse_sentences(full_text)

# The output file is opened only once the sentences are ready, and inside a
# `with` block, so a failure above neither leaks the handle nor leaves an
# empty output file behind.
with open(argv[1] + ".raw", "wb") as out_file:
    for sentence in sentences:
        out_file.write(sentence + "\n")
Ejemplo n.º 3
0
        time.sleep(5)


    # For every thread not yet in `crawled`: download its page, pull out the
    # reply texts, and write the sentences parsed from them to out_file.
    for thread in threads:
        # Skip threads already recorded in `crawled`.
        if thread in crawled:
            continue

        # `thread` is concatenated straight into the URL after the board name
        # taken from argv[1].
        try:        
            threadpage = opener.open("http://boards.4chan.org/"+argv[1]+"/"+ thread).read()
        except urllib2.HTTPError, err:
            # Report the URL and HTTP status code, record the thread so the
            # check at the top of the loop skips it from now on, and move on.
            # Note that this path does not reach the sleep at the bottom.
            print "\n***\thttp://boards.4chan.org/"+argv[1]+"/"+ thread + ":\t"+str(err.code) + "\n"
            crawled.add(thread)
            continue

        # Capture the tag-free text ([^<]+) that sits between a ">>NNN" quote
        # link followed by <br> and the closing </blockquote>.
        # NOTE(review): the (?!<blockquote>) lookahead appears redundant,
        # since [^<]+ cannot begin with '<' anyway - confirm before removing.
        text = re.findall('"quotelink">&gt;&gt;[0-9]+</a><br>(?!<blockquote>)([^<]+)</blockquote>', threadpage)

        for line in text:
            # Skip captures that contain this placeholder text.
            if "only visible to 4chan gold members" in line:
                continue
            # Remove newlines, carriage returns and parentheses.
            line = re.sub("([\n\r()]+)","",line)
            # Decode the two HTML entities handled here: apostrophe and
            # double quote. Any other entity is left as-is.
            line = line.replace("&#039;","'")
            line = line.replace("&quot;",'"')

            if " " in line: # Weed out small posts/replies that won't work with our chain
                # Echo each parsed sentence to stdout and write it,
                # newline-terminated, to out_file.
                for sentence in parse_sentences(line):
                    print sentence
                    out_file.write(sentence + "\n")

        # Record the thread as processed, then wait 5 seconds before the next
        # iteration.
        crawled.add(thread)
        time.sleep(5)
Ejemplo n.º 4
0
        time.sleep(5)


    # For every thread not yet in `crawled`: download its page, pull out the
    # reply texts, and write the sentences parsed from them to out_file.
    for thread in threads:
        # Skip threads already recorded in `crawled`.
        if thread in crawled:
            continue

        # `thread` is concatenated straight into the URL after the board name
        # taken from argv[1].
        try:        
            threadpage = opener.open("http://boards.4chan.org/"+argv[1]+"/"+ thread).read()
        except urllib2.HTTPError, err:
            # Report the URL and HTTP status code, record the thread so the
            # check at the top of the loop skips it from now on, and move on.
            # Note that this path does not reach the sleep at the bottom.
            print "\n***\thttp://boards.4chan.org/"+argv[1]+"/"+ thread + ":\t"+str(err.code) + "\n"
            crawled.add(thread)
            continue

        # Capture the tag-free text ([^<]+) that sits between a ">>NNN" quote
        # link followed by <br> and the closing </blockquote>.
        # NOTE(review): the (?!<blockquote>) lookahead appears redundant,
        # since [^<]+ cannot begin with '<' anyway - confirm before removing.
        text = re.findall('"quotelink">&gt;&gt;[0-9]+</a><br>(?!<blockquote>)([^<]+)</blockquote>', threadpage)

        for line in text:
            # Skip captures that contain this placeholder text.
            if "only visible to 4chan gold members" in line:
                continue
            # Remove newlines, carriage returns and parentheses.
            line = re.sub("([\n\r()]+)","",line)
            # Decode the two HTML entities handled here: apostrophe and
            # double quote. Any other entity is left as-is.
            line = line.replace("&#039;","'")
            line = line.replace("&quot;",'"')

            if " " in line: # Weed out small posts/replies that won't work with our chain
                # Echo each parsed sentence to stdout and write it,
                # newline-terminated, to out_file.
                for sentence in parse_sentences(line):
                    print sentence
                    out_file.write(sentence + "\n")

        # Record the thread as processed, then wait 5 seconds before the next
        # iteration.
        crawled.add(thread)
        time.sleep(5)