Ejemplo n.º 1
0
def restaurant_saved_text_preprocess(text):
    """
    Strip Yelp boilerplate from a saved restaurant page, tokenize it, and
    return the cleaned, lower-cased word list.

    Parameters
    ----------
    text : str
        Raw page text saved from Yelp.

    Returns
    -------
    list of str
        Lower-cased tokens; if a trailing-boilerplate phrase is found
        ("claim this business", "[ edit ]", "edit business info"), only
        the words after it are kept.
    """
    # Remove known Yelp boilerplate sentences verbatim.
    for boilerplate in (
        "Copyright © 2004-2014 Yelp Inc. Yelp, , and related marks are registered trademarks of Yelp.",
        "Was this review ...?",
        "This user has arrived from Qype, the newest addition to the Yelp family. The Yelp & Qype engineering team is hard at work integrating the two sites, so stay tuned! Thanks for your patience.",
    ):
        text = text.replace(boilerplate, "")

    # Insert a space between punctuation and adjacent alphanumerics so the
    # tokenizer splits them.  Raw strings are required: "\g" is an invalid
    # escape in a plain string literal (SyntaxWarning on Python 3.12+).
    text = re.sub(r"([.,!?;])([0-9a-zA-Z])", r"\g<1> \g<2>",
                  re.sub(r"([0-9a-zA-Z])([.,!?;])", r"\g<1> \g<2>", text))

    words = [word for line in to_raw_text(text) for word in line]
    words = " ".join(words).lower().split()
    text = " ".join(words)

    # Everything before these phrases is page chrome; keep what follows.
    for phrase in ["claim this business", "[ edit ]", "edit business info"]:
        phrase_pos = text.find(phrase)
        if phrase_pos != -1:
            words = text[phrase_pos + len(phrase):].split()
            text = " ".join(words)

    return words
Ejemplo n.º 2
0
def epub2txt(path, extension):
    """Convert an epub to a .txt file using ebooklib; return the new .txt path.

    Parameters
    ----------
    path : str
        Path to the input epub file.
    extension : str
        The file's extension (e.g. ".epub"), replaced by ".txt" in the output
        path.

    Returns
    -------
    str
        Path of the text file that was written.
    """
    # Replace only the trailing extension.  The original used
    # path.replace(extension, ".txt"), which rewrote *every* occurrence of
    # the extension string anywhere in the path.
    if extension and path.endswith(extension):
        outputPath = path[:-len(extension)] + ".txt"
    else:
        outputPath = path.replace(extension, ".txt")

    from epub_conversion.utils import open_book, convert_epub_to_lines
    from xml_cleaner import to_raw_text

    lines = convert_epub_to_lines(open_book(path))
    # Open once in write mode: the original reopened the file in append mode
    # for every line, which was slow and appended to stale output on reruns.
    with open(outputPath, "w") as f:
        for line in lines:
            line = to_raw_text(line, keep_whitespace=True)[0]  # strip markup
            # Keep only longer, non-markup lines to skip titles/pagination.
            if len(line) > 15 and line[0] != "<":
                f.write("".join(line) + "\n")
    return outputPath
Ejemplo n.º 3
0
def restaurant_saved_text_preprocess(text):
    """
    Strip Yelp boilerplate from a saved restaurant page, tokenize it, and
    return the cleaned, lower-cased word list.

    Parameters
    ----------
    text : str
        Raw page text saved from Yelp.

    Returns
    -------
    list of str
        Lower-cased tokens; if a trailing-boilerplate phrase is found
        ("claim this business", "[ edit ]", "edit business info"), only
        the words after it are kept.
    """
    # Remove known Yelp boilerplate sentences verbatim.
    text = text.replace("Copyright © 2004-2014 Yelp Inc. Yelp, , and related marks are registered trademarks of Yelp.", "").replace("Was this review ...?", "").replace("This user has arrived from Qype, the newest addition to the Yelp family. The Yelp & Qype engineering team is hard at work integrating the two sites, so stay tuned! Thanks for your patience.", "")

    # Insert a space between punctuation and adjacent alphanumerics.  Raw
    # strings are required: "\g" is an invalid escape in a plain string
    # literal (SyntaxWarning on Python 3.12+).
    text = re.sub(r"([.,!?;])([0-9a-zA-Z])", r"\g<1> \g<2>",
                  re.sub(r"([0-9a-zA-Z])([.,!?;])", r"\g<1> \g<2>", text))

    words = [word for line in to_raw_text(text) for word in line]
    words = " ".join(words).lower().split()
    text = " ".join(words)

    # Everything before these phrases is page chrome; keep what follows.
    for phrase in ["claim this business", "[ edit ]", "edit business info"]:
        phrase_pos = text.find(phrase)
        if phrase_pos != -1:
            words = text[phrase_pos + len(phrase):].split()
            text = " ".join(words)

    return words
Ejemplo n.º 4
0
def create_indices(data, mincount=5):
    """
    Build category and word index mappings from a product catalog.

    Parameters
    ----------
    data : dict
        Mapping of product id -> product; each product exposes ``categories``
        (iterable) and ``description`` (either a pre-tokenized list, or raw
        text which is tokenized with ``to_raw_text`` and cached back onto the
        product in place).
    mincount : int, optional
        Minimum number of occurrences a word needs to be indexed.

    Returns
    -------
    tuple
        ``(index2category, category2index, index2word, word2index)``.
        The vocabulary always ends with the special tokens "**UNKNOWN**"
        and "**END**".
    """
    index2category = []
    category2index = {}
    word2index = {}
    index2word = []
    wordcounts = {}

    for product in data.values():
        for category in product.categories:
            if category not in category2index:
                category2index[category] = len(index2category)
                index2category.append(category)

        # Tokenize lazily and cache the result on the product so repeated
        # calls don't re-tokenize.  isinstance is the idiomatic type check.
        if not isinstance(product.description, list):
            des = []
            for line in to_raw_text(product.description):
                des.extend(line)
                des.append("\n")
            product.description = des

        for word in product.description:
            wordcounts[word] = wordcounts.get(word, 0) + 1

    # Index only words that clear the frequency threshold.
    for word, occurrence in wordcounts.items():
        if occurrence >= mincount:
            word2index[word] = len(index2word)
            index2word.append(word)

    word2index["**UNKNOWN**"] = len(index2word)
    index2word.append("**UNKNOWN**")

    word2index["**END**"] = len(index2word)
    index2word.append("**END**")

    return index2category, category2index, index2word, word2index
Ejemplo n.º 5
0
def tokenize(text):
    """Flatten the sentences produced by to_raw_text into one token list."""
    tokens = []
    for sentence in to_raw_text(text):
        tokens.extend(sentence)
    return tokens
Ejemplo n.º 6
0
def convert_lines_to_text(lines, article_title):
    """Yield each tokenized sentence as a space-joined, newline-terminated string.

    ``article_title`` is accepted for interface compatibility but not used.
    """
    for tokens in to_raw_text(lines):
        joined = " ".join(tokens)
        yield joined + "\n"
Ejemplo n.º 7
0
def convert_lines_to_text(lines, article_title):
    """Generate one newline-terminated text line per sentence from to_raw_text.

    ``article_title`` is kept for interface compatibility; it is unused here.
    """
    for sentence_tokens in to_raw_text(lines):
        yield "{}\n".format(" ".join(sentence_tokens))
Ejemplo n.º 8
0
# Characters preserved when --special_chars stripping is enabled.
punctuation = [".", ",", "?", "!", ";", " ", ":", "\n", "’", "-"]

if not os.path.exists(args.inputFile):
    raise SystemError("unable to find file " + args.inputFile)
filenameWithoutExt, extension = os.path.splitext(
    os.path.basename(args.inputFile))
outputFilename = filenameWithoutExt + "_parsed.txt"
if extension.upper() != ".TXT":
    raise SystemError("input file isn't in supported format (.txt)")

with open(args.inputFile, "rt") as inputFile:
    with open(outputFilename, "wt") as outputFile:
        # Stream the file line by line instead of materializing readlines().
        for line in inputFile:
            if args.strip_xml:
                line = "".join(to_raw_text(line, keep_whitespace=True)[0])
            if args.special_chars:
                line = "".join(
                    [c for c in line if c.isalnum() or c in punctuation])
            if args.numbers:
                line = "".join([c for c in line if not c.isdigit()])
            if args.word_count and len(line.split(" ")) < args.word_count:
                continue
            # Guard the first-character checks: the original indexed [0]
            # directly and raised IndexError on a line that was all spaces
            # (or became empty after the digit filter above).
            compact = line.replace(" ", "")
            if args.numbered_lines and compact and compact[0].isdigit():
                continue
            if args.markup_lines and compact and compact[0] == "<":
                continue
            outputFile.write(line)

print("successfully parsed", args.inputFile, "to", outputFilename,
      "with options :\n", args)