Beispiel #1
0
def clean_title_file(path):
    with decode_open(path, "r", "utf8") as f:
        for i, l in enumerate(f):
            if i % 10000 == 0:
                logging.error("%.4f completed", float(i) / 806792)
            obj = json.loads(l)
            words = nltk.word_tokenize(obj[1])
            if is_monocase(words):
                print l,
Beispiel #2
0
def get_document_content(path):
    """
    Read the document stored at `path` and return its full text.

    The file is opened through `decode_open` as UTF-8 with the "ignore"
    error handler.

    >>> get_document_content("/home/group/puls/Shared/capitalization-recovery/reuters-text/sth.txt")
    u'something'
    """
    with decode_open(path, "r", "utf8", "ignore") as handle:
        text = handle.read()
    return text
def clean_title_file(path):
    with decode_open(path, "r", "utf8") as f:
        for i,l in enumerate(f):
            if i % 10000 == 0:
                logging.error("%.4f completed", float(i) / 806792)
            obj = json.loads(l)
            words = nltk.word_tokenize(obj[1])
            if is_monocase(words):
                print l, 
def get_document_content(path):
    """
    Return everything contained in the document file at `path`.

    Decoding is delegated to `decode_open` (UTF-8, "ignore" error handler).

    >>> get_document_content("/home/group/puls/Shared/capitalization-recovery/reuters-text/sth.txt")
    u'something'
    """
    with decode_open(path, "r", "utf8", "ignore") as source:
        whole_document = source.read()
        return whole_document
Beispiel #5
0
def get_document_content_paf(path):
    """
    Content extractor for PAF file
    Exclude the title and get the actual content of the document

    The title end offset is looked up via `get_title_position` in the
    companion file `path + ".paf"`; everything from that offset onward in
    the document at `path` is returned.

    >>> c = get_document_content_paf("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    """
    _, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        content = doc.read()
    # Slicing a string already yields a string; re-joining its characters
    # was redundant work and, on Python 2, produced a byte '' instead of
    # u'' when nothing followed the title.
    return content[end:]
def get_document_content_paf(path):
    """
    Content extractor for PAF file
    Exclude the title and get the actual content of the document

    Reads the title's end offset from `path + ".paf"` (through
    `get_title_position`) and returns the document text that follows it.

    >>> c = get_document_content_paf("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    """
    _, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        # The slice is returned directly: "".join() over an already-sliced
        # string only rebuilt the same text one character at a time (and
        # returned a byte '' for an empty remainder on Python 2).
        return doc.read()[end:]
Beispiel #7
0
def get_title_and_content_by_paf(path):
    """
    Split the document at `path` around its title.

    The title's start and end offsets come from `get_title_position`,
    applied to the companion file `path + ".paf"`.

    Return:
    the content before the title,
    the title
    the body
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        content = doc.read()
    # Plain slices are already strings; the former "".join() around each
    # slice was a character-by-character no-op (and returned a byte ''
    # instead of u'' for empty parts on Python 2).
    return content[:start], content[start:end], content[end:]
def get_title_and_content_by_paf(path):
    """
    Cut the document at `path` into three pieces using the title offsets
    that `get_title_position` reads from `path + ".paf"`.

    Return:
    the content before the title,
    the title
    the body
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8", "ignore") as doc:
        content = doc.read()
        # Slices are returned as-is: wrapping each one in "".join() merely
        # re-assembled the same string and, on Python 2, turned an empty
        # unicode part into a byte ''.
        before_title = content[:start]
        title = content[start:end]
        body = content[end:]
        return (before_title, title, body)
Beispiel #9
0
def get_title_position(path):
    """
    Find the title's start and end offsets recorded in a .paf file.

    The file is scanned line by line with the module-level
    `title_pos_regexp`; groups 1 and 2 of the first match are returned as
    an `(int, int)` tuple.  An `Exception` is raised when no line matches.

    >>> get_title_position("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F.paf")
    (42, 77)
    """
    with decode_open(path, "r", "utf8") as paf:
        for line in paf:
            match = title_pos_regexp.search(line)
            if not match:
                continue
            # First matching line wins: group 1 is the start, group 2 the end.
            return int(match.group(1)), int(match.group(2))

    raise Exception("Unable to find start and end position for %s" % path)
def get_title_position(path):
    """
    Return the `(start, end)` title offsets stored in the .paf file `path`.

    Each line is searched with `title_pos_regexp` (defined elsewhere in
    this module); the first hit supplies both integers.  If the file
    contains no matching line an `Exception` is raised.

    >>> get_title_position("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F.paf")
    (42, 77)
    """
    with decode_open(path, "r", "utf8") as paf:
        for raw_line in paf:
            hit = title_pos_regexp.search(raw_line)
            if hit:
                # Fetch both capture groups at once, then convert in order.
                first, second = hit.group(1, 2)
                start = int(first)
                end = int(second)
                return start, end

    raise Exception("Unable to find start and end position for %s" % path)
def save_content(content, original_file_path,
                 target_directory="/group/home/puls/Shared/capitalization-recovery/reuters-text/"):
    """
    Write `content` to a .txt file inside `target_directory` and return
    the path that was written.

    The output name is the base name of `original_file_path` up to its
    first ".", with ".txt" appended.

    >>> save_content("something", "/group/home/puls/Shared/capitalization-recovery/RCV1/REUTERS_CORPUS_1/sth.xml", target_directory = "/group/home/puls/Shared/capitalization-recovery/reuters-text/")
    '/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt'
    >>> open("/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt").read()
    'something'
    """
    # Everything before the first "." of the base name becomes the stem.
    stem = os.path.basename(original_file_path).partition(".")[0]
    content_path = os.path.join(target_directory, stem + ".txt")
    with decode_open(content_path, "w", "utf8") as out:
        out.write(content)
    return content_path
Beispiel #12
0
def extract_title(path):
    """
    Given document file path
    Extract the title

    The title's start and end offsets are read from `path + ".paf"` via
    `get_title_position`, and that slice of the document text is returned.
    No error handler is passed to `decode_open` here.

    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/12/www.ameinfo.com.rssfeeds.10660/DE01D30EA383DFD9FA1427CB9CC935F2")
    u'Polaroid launches new range of products at opening day of GITEX Technology Week 2014'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    u'Bad Bets Rock Fortress\u2019s Macro Fund'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/feeds.foxbusiness.com.foxbusiness/E1D1899ED1CDEAB1574C1D279CBA2632")
    u'Is Gold\u2019s Knockout Punch Coming?'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/www.streetinsider.com.freefeed.php/34D4137A7AEB5118C6E9EC451E66B529")
    u'Solving IT Debuts on Staffing Industry Analysts\u2019 Top 100 Fastest-Growing U.S. Staffing and Talent Engagement Firms'
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8") as doc:
        content = doc.read()
    # The slice is already a string; joining its characters again was
    # redundant and, on Python 2, yielded a byte '' for an empty title.
    return content[start:end]
def extract_title(path):
    """
    Given document file path
    Extract the title

    Looks up the title offsets in the companion file `path + ".paf"`
    (through `get_title_position`) and returns the text between them.

    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/12/www.ameinfo.com.rssfeeds.10660/DE01D30EA383DFD9FA1427CB9CC935F2")
    u'Polaroid launches new range of products at opening day of GITEX Technology Week 2014'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/online.wsj.com.xml.rss.3_7031/3918A8D35025B47AC6A62D293F5F506F")
    u'Bad Bets Rock Fortress\u2019s Macro Fund'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/feeds.foxbusiness.com.foxbusiness/E1D1899ED1CDEAB1574C1D279CBA2632")
    u'Is Gold\u2019s Knockout Punch Coming?'
    >>> extract_title("/group/home/puls/Shared/capitalization-recovery/30/www.streetinsider.com.freefeed.php/34D4137A7AEB5118C6E9EC451E66B529")
    u'Solving IT Debuts on Staffing Industry Analysts\u2019 Top 100 Fastest-Growing U.S. Staffing and Talent Engagement Firms'
    """
    start, end = get_title_position(path + ".paf")
    with decode_open(path, "r", "utf8") as doc:
        # Return the slice itself: "".join() over a sliced string rebuilt
        # the same text character by character and returned a byte ''
        # rather than u'' for an empty slice on Python 2.
        return doc.read()[start:end]
Beispiel #14
0
def save_content(
    content,
    original_file_path,
    target_directory="/group/home/puls/Shared/capitalization-recovery/reuters-text/"
):
    """
    Store `content` under `target_directory` and return the saved path.

    The target file is named after the part of `original_file_path`'s base
    name that precedes the first ".", plus a ".txt" suffix.

    >>> save_content("something", "/group/home/puls/Shared/capitalization-recovery/RCV1/REUTERS_CORPUS_1/sth.xml", target_directory = "/group/home/puls/Shared/capitalization-recovery/reuters-text/")
    '/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt'
    >>> open("/group/home/puls/Shared/capitalization-recovery/reuters-text/sth.txt").read()
    'something'
    """
    base_name = os.path.basename(original_file_path)
    # Only the text before the first dot is kept as the stem.
    stem = base_name.split(".", 1)[0]
    txt_name = stem + ".txt"
    content_path = os.path.join(target_directory, txt_name)
    with decode_open(content_path, "w", "utf8") as sink:
        sink.write(content)
    return content_path