def restaurant_saved_text_preprocess(text):
    """Isolate sentences into words and group them into a single stream.

    Strips known Yelp boilerplate, separates punctuation from adjacent
    alphanumerics so punctuation tokenizes separately, lowercases the
    text, and drops everything up to (and including) known page-footer
    phrases.

    Parameters
    ----------
    text : str
        Raw review text scraped from a Yelp page.

    Returns
    -------
    list of str
        The cleaned, lowercased token stream.
    """
    # Remove site boilerplate that would otherwise pollute the tokens.
    text = text.replace(
        "Copyright © 2004-2014 Yelp Inc. Yelp, , and related marks are registered trademarks of Yelp.",
        ""
    ).replace("Was this review ...?", "").replace(
        "This user has arrived from Qype, the newest addition to the Yelp family. The Yelp & Qype engineering team is hard at work integrating the two sites, so stay tuned! Thanks for your patience.",
        "")
    # Insert a space between punctuation and alphanumerics in both
    # directions. Raw strings fix the invalid "\g" escape in the
    # original non-raw replacement templates.
    text = re.sub(r"([.,!?;])([0-9a-zA-Z])", r"\g<1> \g<2>",
                  re.sub(r"([0-9a-zA-Z])([.,!?;])", r"\g<1> \g<2>", text))
    words = [word for line in to_raw_text(text) for word in line]
    words = (" ".join(words).lower()).split()
    text = " ".join(words)
    # Drop everything up to and including known footer phrases.
    for phrase in ["claim this business", "[ edit ]", "edit business info"]:
        phrase_pos = text.find(phrase)
        if phrase_pos != -1:
            words = text[phrase_pos + len(phrase):].split()
            text = " ".join(words)
    return words
def epub2txt(path, extension):
    """Convert an epub to a .txt file using ebooklib; return the new path.

    Parameters
    ----------
    path : str
        Path to the input .epub file.
    extension : str
        Extension substring replaced with ".txt" to form the output path
        (typically ".epub").

    Returns
    -------
    str
        Path of the generated .txt file.

    Note: output is opened in append mode, so repeated calls on the same
    input accumulate text in the output file (original behavior kept).
    """
    outputPath = path.replace(extension, ".txt")
    from epub_conversion.utils import open_book, convert_epub_to_lines
    from xml_cleaner import to_raw_text
    lines = convert_epub_to_lines(open_book(path))
    # Open the output once instead of re-opening it for every single
    # line, as the original did inside the loop (O(n) file opens).
    with open(outputPath, "a") as f:
        for line in lines:
            # to_raw_text strips out markup.
            line = to_raw_text(line, keep_whitespace=True)[0]
            # Keep only longer, non-markup lines to avoid titles and
            # pagination.
            if len(line) > 15 and line[0] != "<":
                f.write("".join(line) + "\n")
    return outputPath
def restaurant_saved_text_preprocess(text):
    """Isolate sentences into words and group them into a single stream.

    NOTE(review): this duplicates an earlier definition of the same
    function in this file — the later one wins at import time; consider
    removing one copy.

    Parameters
    ----------
    text : str
        Raw review text scraped from a Yelp page.

    Returns
    -------
    list of str
        The cleaned, lowercased token stream.
    """
    # Strip known Yelp boilerplate strings.
    text = text.replace(
        "Copyright © 2004-2014 Yelp Inc. Yelp, , and related marks are registered trademarks of Yelp.",
        ""
    ).replace("Was this review ...?", "").replace(
        "This user has arrived from Qype, the newest addition to the Yelp family. The Yelp & Qype engineering team is hard at work integrating the two sites, so stay tuned! Thanks for your patience.",
        "")
    # Separate punctuation from adjacent alphanumerics in both
    # directions. Raw strings fix the invalid "\g" escape in the
    # original non-raw replacement templates.
    text = re.sub(r"([.,!?;])([0-9a-zA-Z])", r"\g<1> \g<2>",
                  re.sub(r"([0-9a-zA-Z])([.,!?;])", r"\g<1> \g<2>", text))
    words = [word for line in to_raw_text(text) for word in line]
    words = (" ".join(words).lower()).split()
    text = " ".join(words)
    # Drop everything up to and including known footer phrases.
    for phrase in ["claim this business", "[ edit ]", "edit business info"]:
        phrase_pos = text.find(phrase)
        if phrase_pos != -1:
            words = text[phrase_pos + len(phrase):].split()
            text = " ".join(words)
    return words
def create_indices(data, mincount=5):
    """Build category and vocabulary indices from product data.

    Parameters
    ----------
    data : dict
        Mapping of keys to product objects exposing ``categories`` (an
        iterable of category names) and ``description`` (either a raw
        string, which is tokenized in place via ``to_raw_text``, or an
        already-tokenized list of words).
    mincount : int, optional
        Minimum number of occurrences for a word to enter the
        vocabulary (default 5).

    Returns
    -------
    tuple
        ``(index2category, category2index, index2word, word2index)``;
        the vocabulary always ends with the ``**UNKNOWN**`` and
        ``**END**`` sentinel tokens.
    """
    index2category = []
    category2index = {}
    word2index = {}
    index2word = []
    wordcounts = {}
    for product in data.values():
        # Register each category the first time it is seen.
        for category in product.categories:
            if category not in category2index:
                category2index[category] = len(index2category)
                index2category.append(category)
        # Lazily tokenize descriptions that are still raw strings,
        # caching the token list back onto the product.
        # (isinstance replaces the original `type(...) == list` check.)
        if not isinstance(product.description, list):
            des = []
            for line in to_raw_text(product.description):
                des.extend(line)
                des.append("\n")
            product.description = des
        # Count word occurrences across all descriptions.
        for word in product.description:
            wordcounts[word] = wordcounts.get(word, 0) + 1
    # Keep only words seen at least `mincount` times, in first-seen order.
    for word, occurrence in wordcounts.items():
        if occurrence >= mincount:
            word2index[word] = len(index2word)
            index2word.append(word)
    # Append the sentinel tokens last so their indices are stable.
    for sentinel in ("**UNKNOWN**", "**END**"):
        word2index[sentinel] = len(index2word)
        index2word.append(sentinel)
    return index2category, category2index, index2word, word2index
def tokenize(text):
    """Flatten the sentences produced by ``to_raw_text`` into one token list."""
    tokens = []
    for sentence in to_raw_text(text):
        tokens.extend(sentence)
    return tokens
def convert_lines_to_text(lines, article_title):
    """Yield one newline-terminated, space-joined string per sentence.

    ``article_title`` is accepted for interface compatibility but unused
    here.
    """
    for tokens in to_raw_text(lines):
        sentence = " ".join(tokens)
        yield sentence + "\n"
def convert_lines_to_text(lines, article_title):
    """Generate newline-terminated sentence strings from ``lines``.

    NOTE: ``article_title`` is not used; kept for caller compatibility.
    """
    for parsed in to_raw_text(lines):
        yield "{}\n".format(" ".join(parsed))
# Script body: filter a .txt file line-by-line according to the parsed
# command-line flags in `args` and write the result next to the script
# as "<name>_parsed.txt".

# Characters retained (besides alphanumerics) when --special_chars is set.
punctuation = [".", ",", "?", "!", ";", " ", ":", "\n", "’", "-"]
if not os.path.exists(args.inputFile):
    raise SystemError("unable to find file " + args.inputFile)
filenameWithoutExt, extension = os.path.splitext(
    os.path.basename(args.inputFile))
outputFilename = filenameWithoutExt + "_parsed.txt"
if extension.upper() != ".TXT":
    raise SystemError("input file isn't in supported format (.txt)")
with open(args.inputFile, "rt") as inputFile:
    with open(outputFilename, "wt") as outputFile:
        # Iterate the file object lazily instead of materializing every
        # line up front with readlines().
        for line in inputFile:
            if args.strip_xml:
                # to_raw_text strips markup; keep_whitespace preserves
                # the original spacing.
                line = "".join(to_raw_text(line, keep_whitespace=True)[0])
            if args.special_chars:
                line = "".join(
                    [c for c in line if c.isalnum() or c in punctuation])
            if args.numbers:
                line = "".join([c for c in line if not c.isdigit()])
            if args.word_count and len(line.split(" ")) < args.word_count:
                continue
            # Hoist the space-stripped copy used by the next two checks,
            # and guard against an empty result: indexing [0] on a line
            # reduced to "" by the filters above could raise IndexError
            # in the original.
            compact = line.replace(" ", "")
            if args.numbered_lines and compact and compact[0].isdigit():
                continue
            if args.markup_lines and compact and compact[0] == "<":
                continue
            outputFile.write(line)
print("successfully parsed", args.inputFile, "to", outputFilename,
      "with options :\n", args)