import json
import logging
import os

import nltk


def clean_title_file(path):
    """
    Read a JSON-lines title file and print the lines whose tokenized
    title is monocase.
    """
    with decode_open(path, "r", "utf8") as f:
        for i, l in enumerate(f):
            if i % 10000 == 0:
                # 806792 is presumably the total line count of the input file;
                # note the progress report is logged at ERROR level
                logging.error("%.4f completed", float(i) / 806792)
            obj = json.loads(l)
            words = nltk.word_tokenize(obj[1])
            if is_monocase(words):
                print l,  # trailing comma: `l` already ends with a newline
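# `decode_open` and `is_monocase` are used throughout this module but not
# defined in it. Minimal sketches of what they might look like; both the
# behaviors below are assumptions, not the original implementations.
import codecs


def decode_open(path, mode, encoding, errors="strict"):
    # Assumed to be a thin wrapper around codecs.open, returning a file
    # object that decodes/encodes with the given codec.
    return codecs.open(path, mode, encoding=encoding, errors=errors)


def is_monocase(words):
    # Assumed to test whether a tokenized title uses only one letter case
    # (all-lower or all-upper), i.e. carries no capitalization signal.
    alpha = [w for w in words if w.isalpha()]
    return all(w.islower() for w in alpha) or all(w.isupper() for w in alpha)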
def get_document_content(path):
    """
    Get the actual content of the document.

    >>> get_document_content("/home/group/puls/Shared/capitalization-recovery/reuters-text/sth.txt")
    u'something'
    """
    with decode_open(path, "r", "utf8", "ignore") as doc:
        return doc.read()
def get_document_content_paf(path):
    """
    Content extractor for PAF-annotated files: exclude the title and
    return the actual body of the document.

    >>> c = get_document_content_paf("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    """
    _, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        content = doc.read()
    return content[end:]  # everything after the title
def get_title_and_content_by_paf(path):
    """
    Return a 3-tuple: the content before the title, the title itself,
    and the body after the title.
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        content = doc.read()
    return (content[:start], content[start:end], content[end:])
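# Hypothetical sanity check (not in the original module): the three pieces
# returned by get_title_and_content_by_paf should reassemble into exactly
# the text that get_document_content reads, since both use the same
# decoding settings.
def _check_split_roundtrip(path):
    pre, title, body = get_title_and_content_by_paf(path)
    return pre + title + body == get_document_content(path)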
def get_title_position(path):
    """
    Return the (start, end) character offsets of the title recorded in a
    .paf file.

    >>> get_title_position("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F.paf")
    (42, 77)
    """
    with decode_open(path, "r", "utf8") as paf:
        for line in paf:
            match = title_pos_regexp.search(line)
            if match:
                # title start/end indices
                start = int(match.group(1))
                end = int(match.group(2))
                return start, end
    raise Exception("Unable to find start and end position for %s" % path)
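# `title_pos_regexp` is referenced above but never defined in this module.
# A hypothetical definition, assuming each .paf file carries a line with two
# integer offsets delimiting the title; the real PAF line format may differ.
import re

title_pos_regexp = re.compile(r"TITLE\s+(\d+)\s+(\d+)")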
def save_content(content, original_file_path,
                 target_directory="/group/home/puls/Shared/capitalization-recovery/reuters-text/"):
    """
    Save the content under target_directory and return the saved path.

    >>> save_content("something", "/group/home/puls/Shared/capitalization-recovery/RCV1/REUTERS_CORPUS_1/sth.xml", target_directory="/group/home/puls/Shared/capitalization-recovery/reuters-text/")
    '/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt'
    >>> open("/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt").read()
    'something'
    """
    original_file_name = os.path.basename(original_file_path)
    # keep everything before the first dot, so "sth.xml" -> "sth.txt"
    file_name = original_file_name.split(".")[0] + ".txt"
    content_path = os.path.join(target_directory, file_name)
    with decode_open(content_path, "w", "utf8") as f:
        f.write(content)
    return content_path
def extract_title(path):
    """
    Given a document file path, extract the title.

    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/12/www.ameinfo.com.rssfeeds.10660/DE01D30EA383DFD9FA1427CB9CC935F2")
    u'Polaroid launches new range of products at opening day of GITEX Technology Week 2014'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    u'Bad Bets Rock Fortress\u2019s Macro Fund'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/feeds.foxbusiness.com.foxbusiness/E1D1899ED1CDEAB1574C1D279CBA2632")
    u'Is Gold\u2019s Knockout Punch Coming?'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/www.streetinsider.com.freefeed.php/34D4137A7AEB5118C6E9EC451E66B529")
    u'Solving IT Debuts on Staffing Industry Analysts\u2019 Top 100 Fastest-Growing U.S. Staffing and Talent Engagement Firms'
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8") as doc:
        content = doc.read()  # read the whole document
    return content[start:end]  # slice out the title
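if __name__ == "__main__":
    # Minimal usage sketch (hypothetical invocation, not in the original
    # module): split a PAF-annotated document into title and body, then
    # save the body as plain text under the default target directory.
    import sys

    doc_path = sys.argv[1]
    print extract_title(doc_path)
    save_content(get_document_content_paf(doc_path), doc_path)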