Beispiel #1
0
def clean_article_as_string(article, defoe_path, os_type):
    """
    Clean an article as a single string.

    The article's words are joined with single spaces, every ``"- "``
    sequence is removed so that words hyphenated across a break
    (``"exam- ple"``) are merged back together, and the result is handed
    to ``longsfix_sentence`` (the long-s fix) when it is longer than one
    character and contains an ``f``.

    :param article: Article; only its ``words`` attribute is read
    :type article: defoe.papers.article.Article
    :param defoe_path: passed through unchanged to ``longsfix_sentence``
    :param os_type: passed through unchanged to ``longsfix_sentence``
    :return: clean article words as a string
    :rtype: string or unicode
    """
    # Leading empty words are skipped (no separator is emitted for them);
    # empty words after the first non-empty one still contribute a space.
    kept_words = []
    for word in article.words:
        if kept_words or word != '':
            kept_words.append(word)

    # A single join avoids rebuilding the string once per word.
    article_string = ' '.join(kept_words)

    # "- " marks a word broken by a hyphen: drop it to glue the halves.
    article_combined = article_string.replace('- ', '')

    # Only texts containing an "f" are sent to the long-s fixer
    # (presumably because the long-s is mis-read as "f" -- confirm
    # against longsfix_sentence).
    if len(article_combined) > 1 and 'f' in article_combined:
        return longsfix_sentence(article_combined, defoe_path, os_type)
    return article_combined
Beispiel #2
0
def clean_text_as_string(text, flag, defoe_path, os_type):
    """
    Clean a text as a single string.

    Handles hyphenated words (removes every ``"- "`` so the two halves
    are merged), applies the long-s fix via ``longsfix_sentence`` and
    inserts a space at lower-case/upper-case word boundaries
    (``"worldWide"`` -> ``"world Wide"``).

    The ``flag`` argument selects the mode:

    * ``flag == 0``: ``text`` is an iterable of words. They are joined
      with spaces, de-hyphenated, long-s fixed, split on whitespace, and
      every word that does not contain a ``"."`` gets the boundary
      split. The words are joined back with single spaces.
    * ``flag == 2``: ``text`` is used as given (no joining and no hyphen
      handling). After the long-s fix it is iterated item by item, the
      boundary split is applied to each item and the pieces are
      concatenated without a separator. For a plain string the items
      are single characters, on which the pattern cannot match, so the
      text comes back unchanged after the long-s fix.
    * any other value: as for ``0``, but the boundary split is applied
      to every word (including words containing ``"."``) and the words
      are concatenated WITHOUT a separator.
      NOTE(review): gluing the words together looks unintended --
      confirm with the callers before relying on this mode.

    :param text: iterable of words, or the text itself when ``flag == 2``
    :param flag: mode selector, see above
    :param defoe_path: passed through unchanged to ``longsfix_sentence``
    :param os_type: passed through unchanged to ``longsfix_sentence``
    :return: the cleaned text
    :rtype: string or unicode
    """
    if flag == 2:
        text_combined = text
    else:
        # Leading empty words are skipped (no separator is emitted for
        # them); later empty words still contribute a space.
        kept_words = []
        for word in text:
            if kept_words or word != '':
                kept_words.append(word)
        # "- " marks a word broken by a hyphen: drop it to glue the halves.
        text_combined = ' '.join(kept_words).replace('- ', '')

    # Only texts containing an "f" are sent to the long-s fixer.
    if len(text_combined) > 1 and 'f' in text_combined:
        text_clean = longsfix_sentence(text_combined, defoe_path, os_type)
    else:
        text_clean = text_combined

    pieces = text_clean if flag == 2 else text_clean.split()

    # A lower-case letter followed by an upper-case one, or an upper-case
    # letter followed by "Upper + lower", marks two words stuck together.
    boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')

    if flag == 0:
        # Words with a "." are left alone in this mode.
        return ' '.join(
            piece if "." in piece else boundary.sub(r'\1 ', piece)
            for piece in pieces
        )
    return ''.join(boundary.sub(r'\1 ', piece) for piece in pieces)
def clean_page_as_string(page):
    """
    Clean a page as a single string.

    The page's words are joined with single spaces, every ``"- "``
    sequence is removed so that hyphenated words are merged, the result
    is handed to ``longsfix_sentence`` (the long-s fix) when it is longer
    than one character and contains an ``f``, and finally every word
    without a ``"."`` gets a space inserted at lower-case/upper-case
    boundaries (``"worldWide"`` -> ``"world Wide"``).

    :param page: Page; only its ``words`` attribute is read
    :type page: defoe.nls.Page
    :return: clean page words as a string, separated by single spaces
    :rtype: string or unicode
    """
    # Leading empty words are skipped (no separator is emitted for them);
    # later empty words still contribute a space.
    kept_words = []
    for word in page.words:
        if kept_words or word != '':
            kept_words.append(word)

    # "- " marks a word broken by a hyphen: drop it to glue the halves.
    page_combined = ' '.join(kept_words).replace('- ', '')

    if len(page_combined) > 1 and 'f' in page_combined:
        # NOTE(review): clean_text_as_string and clean_article_as_string
        # pass defoe_path and os_type to longsfix_sentence as well; only
        # the text is passed here -- confirm the helper's signature.
        page_clean = longsfix_sentence(page_combined)
    else:
        page_clean = page_combined

    # A lower-case letter followed by an upper-case one, or an upper-case
    # letter followed by "Upper + lower", marks two words stuck together.
    boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')

    # Words containing a "." are left untouched.
    return ' '.join(
        token if "." in token else boundary.sub(r'\1 ', token)
        for token in page_clean.split()
    )