Example #1
def segment_sentences(filepath, ext='asc', splitter=split_sentences_nltk):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified
           syntax net tree is the input feature set, with common words and N-grams inserted with their label as an additional feature
        3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"

    >>> segment_se
    """
    sentences = []
    for filemeta in find_files(filepath, ext=ext):
        with open(filemeta['path'], 'rt') as fin:
            batch = []
            for i, line in enumerate(fin):
                if not line.strip():
                    sentences.extend(splitter('\n'.join(batch)))
                    batch = [line]  # may contain all whitespace
                else:
                    batch.append(line)
            if len(batch):
                sentences.extend(
                    splitter('\n'.join(batch))
                )  # TODO: tag sentences with line + filename where they started
    return sentences
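All of the segment_sentences variants on this page assume a split_sentences_nltk helper that is not shown here. A minimal sketch of what such a splitter could look like, using NLTK's Punkt tokenizer (an assumption, not the original implementation):

from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' model to be installed


def split_sentences_nltk(text):
    """Split a blob of text into a list of sentence strings."""
    return sent_tokenize(text)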
Example #2
def segment_sentences(path=os.path.join(DATA_PATH, 'book'),
                      splitter=split_sentences_nltk,
                      **find_files_kwargs):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified
           syntax net tree is the input feature set, with common words and N-grams inserted with their label as an additional feature
        3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    ...
    4
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'psychology-scripts.txt'), splitter=split_sentences_nltk))
    ...
    23
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, **find_files_kwargs):
            with open(filemeta['path']) as fin:
                i, batch = 0, []
                try:
                    for i, line in enumerate(fin):
                        if not line.strip():
                            sentences.extend(splitter('\n'.join(batch)))
                            batch = [line]  # may contain all whitespace
                        else:
                            batch.append(line)
                except (UnicodeDecodeError, IOError):
                    logger.error(
                        'UnicodeDecodeError or IOError on line {} in file {} from stat: {}'
                        .format(i + 1, fin.name, filemeta))
                    raise

                if len(batch):
                    # TODO: tag sentences with line + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with line + filename where they started
            sentences.extend(splitter('\n'.join(batch)))

    return sentences
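Because the non-directory branch falls back to iter_lines (see Examples #7 and #9), which also accepts raw multi-line strings, the blank-line batching can be smoke-tested without any files. A sketch (the exact count depends on the sentence splitter):

sents = segment_sentences(path='One sentence. Another one.\n\nA new paragraph here.\n')
len(sents)  # typically 3: two sentences before the blank line, one after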
Example #3
def generate_sentences(text='', train_path=None, case_sensitive=True, ext=['.md', '.txt', '.asc', '.asciidoc'],
                       normalize_ordinals=1, normalize_newlines=1, normalize_sentence_boundaries=1,
                       epochs=20, classifier=nlup.BinaryAveragedPerceptron,
                       re_eol=r'\r\n|\r|\n', **kwargs):
    """Generate sentences from a sequence of characters (text)

    Wrapped text (newlines at column 80, for instance) will confuse this function and break up sentences.
    Wrapper and preprocessor for Kyle Gorman's "DetectorMorse" module

    Arguments:
      preprocess (bool): whether to assume common sentence delimiters in markdown and asciidoc formatting
                         using r'[.?!][ \t]*\n\n|[.?!][ \t]*\r\n\r\n|[.?!][ \t]*\r\r|[.?!][ ][ ][A-Z]'
      case_sensitive (bool): whether to consider case to make decisions about sentence boundaries
      epochs (int): number of epochs (iterations for classifier training)

    """
    ext = [ext] if isinstance(ext, basestring) else ext
    if isinstance(text, basestring) and len(text) <= 256:
        if os.path.isfile(text) and os.path.splitext(text)[-1].lower() in ext:
            text = open(text)
        elif os.path.isdir(text):
            return chain.from_iterable((
                generate_sentences(text=stat['path'], train_path=train_path, ext=ext,
                                   normalize_ordinals=normalize_ordinals, normalize_newlines=normalize_newlines,
                                   normalize_sentence_boundaries=normalize_sentence_boundaries,
                                   epochs=epochs, classifier=classifier, re_eol=re_eol, **kwargs)
                for stat in find_files(text, ext=ext)))
    if isinstance(text, basestring):
        texts = Split(text=text, re_delim=re_eol)
    else:
        texts = chain.from_iterable(Split(text=doc, re_delim=re_eol) for doc in text)

    if normalize_newlines:
        re_eol = re.compile(r'\r\n|\r')
        texts = (re_eol.sub(r'\n', doc) for doc in texts)
    if normalize_ordinals:
        re_ord = re.compile(r'\b([0-9]+|[A-Za-z])[.?!][ \t]{1,4}([A-Za-z])')
        texts = (re_ord.sub(r'\1) \2', doc) for doc in texts)
    if normalize_sentence_boundaries:
        re_eos = re.compile(r'([.?!])([ ][ ])[\n]?([A-Z])')
        texts = (re_eos.sub(r'\1\n\3', doc) for doc in texts)

    if train_path:
        generate_sentences.detector = Detector(slurp(train_path), epochs=epochs, nocase=not case_sensitive)
    elif not isinstance(getattr(generate_sentences, 'detector', None), Detector):
        generate_sentences.detector = Detector.load(
            os.path.join(DATA_PATH, 'wsj_pugnlp.detector_morse.Detector.json.gz'))
    # generate_sentences.detector = SentenceDetector(text=text, nocase=not case_sensitive,
    # epochs=epochs, classifier=classifier)
    return iter(chain.from_iterable((s.lstrip() for s in generate_sentences.detector.segments(text)) for text in texts))
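The three normalize_* flags above are plain regex preprocessing passes. For example, the ordinal normalization rewrites numbered-list markers so the period after "1." is not mistaken for a sentence boundary. A standalone illustration of that single step:

import re

re_ord = re.compile(r'\b([0-9]+|[A-Za-z])[.?!][ \t]{1,4}([A-Za-z])')
print(re_ord.sub(r'\1) \2', '1. Tokenize the text  2. Train the detector'))
# 1) Tokenize the text  2) Train the detector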
Example #4
def segment_sentences(path=os.path.join(DATA_PATH, 'book'), ext='asc', splitter=split_sentences_nltk):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified
           syntax net tree is the input feature set, with common words and N-grams inserted with their label as an additional feature
        3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    8324
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 00 -- Preface.asc'), splitter=split_sentences_nltk))
    139
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 01 -- Packets of Thought (Basic NLP).asc'), splitter=split_sentences_nltk))
    585
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, ext=ext):
            with open(filemeta['path'], 'rt') as fin:
                batch = []
                for i, line in enumerate(fin):
                    if not line.strip():
                        sentences.extend(splitter('\n'.join(batch)))
                        batch = [line]  # may contain all whitespace
                    else:
                        batch.append(line)
                if len(batch):
                    # TODO: tag sentences with line + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with line + filename where they started
            sentences.extend(splitter('\n'.join(batch)))

    return sentences
Example #5
def minify_urls(filepath,
                ext='asc',
                url_regex=None,
                output_ext='.urls_minified',
                access_token=None):
    """ Use bitly or similar minifier to shrink all URLs in text files within a folder structure.

    Used for the NLPIA manuscript directory for Manning Publishing

    bitly API: https://dev.bitly.com/links.html

    Args:
      filepath (str): Directory or file path
      ext (str): File name extension to filter text files by. default='asc'
      output_ext (str): Extension appended to the filenames of altered files. default='.urls_minified' ('' replaces URLs in place)

    FIXME: NotImplementedError! Untested!
    """
    access_token = access_token or secrets.bitly.access_token
    output_ext = output_ext or ''
    url_regex = regex.compile(url_regex) if isinstance(url_regex,
                                                       str) else url_regex
    filemetas = []
    for filemeta in find_files(filepath, ext=ext):
        filemetas += [filemeta]
        altered_text = ''
        with open(filemeta['path'], 'rt') as fin:
            text = fin.read()
        end = 0
        for match in url_regex.finditer(text):
            url = match.group()
            start = match.start()
            altered_text += text[end:start]  # append the text between the previous URL and this one
            resp = requests_get(
                'https://api-ssl.bitly.com/v3/shorten?access_token={}&longUrl={}'
                .format(access_token, url),
                allow_redirects=True,
                timeout=5)
            js = resp.json()
            short_url = js['shortUrl']
            altered_text += short_url
            end = start + len(url)
        altered_text += text[end:]
        with open(filemeta['path'] + (output_ext or ''), 'wt') as fout:
            fout.write(altered_text)
    return altered_text
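The docstring flags this function as untested, and url_regex has no usable default (None would crash on .finditer), so a pattern must be supplied by the caller. The regex below is an illustrative assumption, not a robust URL matcher, and the path and token are placeholders:

import regex

simple_url_regex = regex.compile(r'https?://[^\s"\'<>\)\]]+')
# minify_urls('manuscript/', ext='asc', url_regex=simple_url_regex,
#             access_token='YOUR_BITLY_TOKEN')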
Example #6
def segment_sentences(filepath, ext='asc'):
    """ Insert and delete newlines in a text document to produce once sentence or heading per line.

    Lines are labeled with their classification as "sentence" or "phrase" (e.g title or heading)

    1. process each line with an aggressive sentence segmenter, like DetectorMorse
    2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified syntax net tree is the input feature set,
       with common words and N-grams inserted with their label as an additional feature
    3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
    4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
    5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"
    """
    for filemeta in find_files(filepath, ext=ext):
        altered_text = ''
        with open(filemeta['path'], 'rt') as fin:
            for line in fin:
                altered_text += line
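            # The loop above only accumulates the raw text and then discards it.
            # A hypothetical continuation (assuming the split_sentences_nltk helper from
            # the earlier examples) would re-segment the text and write one sentence per
            # line to a sibling file, e.g.:
            #     sentences = split_sentences_nltk(altered_text)
            #     with open(filemeta['path'] + '.sentences', 'wt') as fout:
            #         fout.write('\n'.join(s.strip() for s in sentences))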
Example #7
def iter_lines(url_or_text, ext=None, mode='rt'):
    r""" Return an iterator over the lines of a file or URI response.

    >>> len(list(iter_lines('cats_and_dogs.txt')))
    263
    >>> len(list(iter_lines(list('abcdefgh'))))
    8
    >>> len(list(iter_lines('abc\n def\n gh\n')))
    3
    >>> len(list(iter_lines('abc\n def\n gh')))
    3
    >>> 20000 > len(list(iter_lines(BOOK_PATH))) > 200
    True
    """
    if url_or_text is None or not url_or_text:
        return []
        # url_or_text = 'https://www.fileformat.info/info/charset/UTF-8/list.htm'
    elif isinstance(url_or_text, (str, bytes, basestring)):
        if '\n' in url_or_text or '\r' in url_or_text:
            return StringIO(url_or_text)
        elif os.path.isfile(os.path.join(DATA_PATH, url_or_text)):
            return open(os.path.join(DATA_PATH, url_or_text), mode=mode)
        elif os.path.isfile(url_or_text):
            return open(os.path.join(url_or_text), mode=mode)
        if os.path.isdir(url_or_text):
            filepaths = [
                filemeta['path']
                for filemeta in find_files(url_or_text, ext=ext)
            ]
            return itertools.chain.from_iterable(map(open, filepaths))
        url = looks_like_url(url_or_text)
        if url:
            # retry the request up to 3 times before giving up
            # (assumes requests-style exceptions, which subclass IOError)
            for i in range(3):
                try:
                    return requests_get(url,
                                        stream=True,
                                        allow_redirects=True,
                                        timeout=5)
                except IOError:
                    if i == 2:
                        raise
        else:
            return StringIO(url_or_text)
    elif isinstance(url_or_text, (list, tuple)):
        # FIXME: make this lazy with chain and map so it doesn't gobble up RAM
        text = ''
        for s in url_or_text:
            text += '\n'.join(list(iter_lines(s, ext=ext, mode=mode))) + '\n'
        return iter_lines(text)
Example #8
def generate_lines(text, ext=['.txt', '.md', '.rst', '.asciidoc', '.asc']):
    r""" Yield text one line at a time from from a single file path, files in a directory, or a text string

    >>> list(generate_lines('Hello crazy\r\nMS/Apple world\rof EOLS.\n'))
    ['Hello crazy\r\n', 'MS/Apple world\r', 'of EOLS.\n']
    """

    if isinstance(text, basestring):
        if len(text) <= 256:
            if os.path.isfile(text) and os.path.splitext(text)[-1].lower() in ext:
                return open(text)
            elif os.path.isdir(text):
                return chain.from_iterable(generate_lines(stat['path']) for stat in find_files(text, ext=ext))
            else:
                return (line for line in Split(text=text))
        else:
            return Split(text=text)
    return chain.from_iterable(generate_lines(obj) for obj in text)
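A hedged usage sketch for generate_lines over a manuscript directory (BOOK_PATH is assumed to point at a folder of matching text files, as in the iter_lines doctests):

# count non-blank manuscript lines lazily, one file at a time
n_nonblank = sum(1 for line in generate_lines(BOOK_PATH) if line.strip())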
Example #9
def iter_lines(url_or_text, ext=None, mode='rt'):
    r""" Return an iterator over the lines of a file or URI response.

    >>> len(list(iter_lines('cats_and_dogs.txt')))
    263
    >>> len(list(iter_lines(list('abcdefgh'))))
    8
    >>> len(list(iter_lines('abc\n def\n gh\n')))
    3
    >>> len(list(iter_lines('abc\n def\n gh')))
    3
    >>> len(list(iter_lines(os.path.join(DATA_PATH, 'book'))))
    3
    """
    if url_or_text is None or not url_or_text:
        return []
        # url_or_text = 'https://www.fileformat.info/info/charset/UTF-8/list.htm'
    elif isinstance(url_or_text, basestring):
        if os.path.isfile(os.path.join(DATA_PATH, url_or_text)):
            return open(os.path.join(DATA_PATH, url_or_text), mode=mode)
        elif os.path.isfile(url_or_text):
            return open(os.path.join(url_or_text), mode=mode)
        elif os.path.isdir(url_or_text):
            filepaths = [
                filemeta['path']
                for filemeta in find_files(url_or_text, ext=ext)
            ]
            return itertools.chain.from_iterable(map(open, filepaths))
        url = is_valid_url(url_or_text)
        if url:
            return requests.get(url, stream=True, allow_redirects=True)
        else:
            return StringIO(url_or_text)
    elif isinstance(url_or_text, (list, tuple)):
        # recursively iterate over the lines of each element in the sequence
        return itertools.chain.from_iterable(
            iter_lines(obj, ext=ext, mode=mode) for obj in url_or_text)
Example #10
def get_wikidata_qnum(wikiarticle, wikisite):
    """ Return the Wikidata Q-number (entity identifier) for a Wikipedia article title.

    >>> print(get_wikidata_qnum(wikiarticle="Andromeda Galaxy", wikisite="enwiki"))
    Q2469
    """
    resp = requests.get(
        'https://www.wikidata.org/w/api.php', {
            'action': 'wbgetentities',
            'titles': wikiarticle,
            'sites': wikisite,
            'props': '',
            'format': 'json'
        }).json()
    return list(resp['entities'])[0]


DATASET_FILENAMES = [
    f['name'] for f in find_files(DATA_PATH, '.csv.gz', level=0)
]
DATASET_FILENAMES += [
    f['name'] for f in find_files(DATA_PATH, '.csv', level=0)
]
DATASET_FILENAMES += [
    f['name'] for f in find_files(DATA_PATH, '.json', level=0)
]
DATASET_FILENAMES += [
    f['name'] for f in find_files(DATA_PATH, '.txt', level=0)
]
DATASET_NAMES = sorted([
    f[:-4] if f.endswith('.csv') else f
    for f in [os.path.splitext(f)[0] for f in DATASET_FILENAMES]
])
DATASET_NAME2FILENAME = dict(zip(DATASET_NAMES, DATASET_FILENAMES))
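The resulting DATASET_NAME2FILENAME mapping lets callers look up a dataset file from its short name. A hypothetical lookup (the dataset name shown is a placeholder):

# filename = DATASET_NAME2FILENAME['cats_and_dogs']   # e.g. 'cats_and_dogs.txt'
# full_path = os.path.join(DATA_PATH, filename)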
Example #11
def get_wikidata_qnum(wikiarticle, wikisite):
    """ Return the Wikidata Q-number (entity identifier) for a Wikipedia article title.

    >>> print(get_wikidata_qnum(wikiarticle="Andromeda Galaxy", wikisite="enwiki"))
    Q2469
    """
    resp = requests.get(
        'https://www.wikidata.org/w/api.php', {
            'action': 'wbgetentities',
            'titles': wikiarticle,
            'sites': wikisite,
            'props': '',
            'format': 'json'
        }).json()
    return list(resp['entities'])[0]


DATASET_FILENAMES = [f['name'] for f in find_files(DATA_PATH, '.csv.gz')]
DATASET_FILENAMES += [f['name'] for f in find_files(DATA_PATH, '.csv')]
DATASET_FILENAMES += [f['name'] for f in find_files(DATA_PATH, '.json')]
DATASET_FILENAMES += [f['name'] for f in find_files(DATA_PATH, '.txt')]
DATASET_NAMES = sorted([
    f[:-4] if f.endswith('.csv') else f
    for f in [os.path.splitext(f)[0] for f in DATASET_FILENAMES]
])
DATASET_NAME2FILENAME = dict(zip(DATASET_NAMES, DATASET_FILENAMES))


def str2int(s):
    s = ''.join(c for c in s if c in '0123456789')
    return int(s or -MAX_INT)
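# A quick illustration of str2int: digits are extracted and concatenated, and an
# empty result falls back to -MAX_INT, e.g.
#     str2int('Chapter 03, page 7')  ->  37
#     str2int('no digits here')      ->  -MAX_INT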

def ubuntu_dialog_raw(args):
    dfs = []
    for stat in find_files(args.data_root, ext='.tsv'):
        df = pd.read_csv(stat['path'], sep='\t')  # read the tab-separated dialog file
        dfs.append(df['dialog_id'])
    return dfs
Example #13
import os
import sys

if __name__ == "__main__":
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "labeler_site.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError:
        # The above import may fail for some other reason. Ensure that the
        # issue is really that Django is missing to avoid masking other
        # exceptions on Python 2.
        try:
            import django  # noqa
        except ImportError:
            raise ImportError(
                "Couldn't import Django. Are you sure it's installed and "
                "available on your PYTHONPATH environment variable? Did you "
                "forget to activate a virtual environment?")
        raise

    try:
        from django.conf import settings
        from pugnlp.futil import find_files

        # delete all `.pyc` files
        for ff in find_files(settings.BASE_DIR, '.pyc'):
            if ff['path'].endswith(
                    '.pyc'
            ):  # double check that find_files is correct about file extension
                os.remove(ff['path'])
    except ImportError:
        print("WARN: unable to delete all pyc files until you install pugnlp.")

    execute_from_command_line(sys.argv)