Code example #1
File: query.py Project: TylerKirby/cltk
def search_corpus(pattern, corpus, context, case_insensitive=True, expand_keyword=False, lemmatized=False, threshold=0.70):
    """Search for pattern in TLG or PHI5.
    TODO: Cleanup hyphenation.
    """

    corpora = ['tlg', 'phi5']
    assert corpus in corpora, "Available corpora: '{}'.".format(corpora)

    if type(context) is str:
        contexts = ['sentence', 'paragraph']
        assert context in contexts or type(context) is int, 'Available contexts: {}'.format(contexts)
    else:
        context = int(context)

    if corpus == 'phi5':
        lang = 'latin'
        index = PHI5_INDEX
        paths = assemble_phi5_author_filepaths()
    elif corpus == 'tlg':
        index = TLG_INDEX
        lang = 'greek'
        paths = assemble_tlg_author_filepaths()

    if expand_keyword:
        # Strip off all regex characters from pattern for Word2Vec lookup
        # First rm escaped chars
        # TODO: Add '\u', '\U', '\x' to this list
        escapes_list = [r'\a', r'\b', r'\f', r'\n', r'\r', r'\t', r'\v', r'\\']
        escapes_str = '|'.join(escapes_list)
        comp_escapes = regex.compile(escapes_str, flags=regex.VERSION1)
        pattern = comp_escapes.sub('', pattern)
        # Second rm remaining punctuation
        punctuation = set(string.punctuation)
        pattern = ''.join(ch for ch in pattern if ch not in punctuation)
        similar_vectors = _keyword_expander(pattern, lang, lemmatized=lemmatized, threshold=threshold)
        print("The following similar terms will be added to the '{0}' query: '{1}'.".format(pattern, similar_vectors))
        pattern = [pattern]
        if similar_vectors:
            pattern += similar_vectors
    else:
        # Wrap the bare string so the search loop below iterates over whole
        # patterns rather than over the individual characters of a string.
        pattern = [pattern]

    for path in paths:
        with open(path) as file_open:
            text = file_open.read()
        for one_pattern in pattern:
            _matches = match_regex(text, one_pattern, language=lang, context=context, case_insensitive=case_insensitive)
            for _match in _matches:
                _id = os.path.split(path)[1][:-4]
                author = index[_id]
                yield (author, _match)
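A minimal usage sketch (not part of the repository): assuming the PHI5 corpus is installed locally and the helpers used above (PHI5_INDEX, assemble_phi5_author_filepaths, match_regex) are importable, the generator could be consumed like this; the query term is purely illustrative.

# Hypothetical call: print each (author, passage) hit for one keyword,
# with a single sentence of context around every match.
for author, passage in search_corpus('amicitia', 'phi5', context='sentence'):
    print(author, ':', passage)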
Code example #2
File: frequency.py Project: eamonnbell/cltk
    def _assemble_corpus_string(self, corpus):
        """Takes a list of filepaths, returns a string containing contents of
        all files."""

        if corpus == 'phi5':
            filepaths = assemble_phi5_author_filepaths()
            file_cleaner = phi5_plaintext_cleanup
        elif corpus == 'tlg':
            filepaths = assemble_tlg_author_filepaths()
            file_cleaner = tlg_plaintext_cleanup

        for filepath in filepaths:
            with open(filepath) as file_open:
                file_read = file_open.read().lower()
            file_clean = file_cleaner(file_read)
            yield file_clean
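A sketch of how this generator might be consumed (not code from frequency.py): the enclosing class is not shown here, so freq stands in for a hypothetical instance of it.

# Join the per-file strings yielded above into one corpus-wide string.
corpus_string = ' '.join(freq._assemble_corpus_string('phi5'))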
Code example #3
File: test_corpus.py Project: ykl7/cltk
    def test_assemble_phi5_author(self):
        """Test building absolute filepaths from PHI5 index."""
        paths = assemble_phi5_author_filepaths()
        self.assertEqual(len(paths), 362)
Code example #4
File: test_corpus.py Project: mcneela/cltk
    def test_assemble_phi5_author(self):
        """Test building absolute filepaths from PHI5 index."""
        paths = assemble_phi5_author_filepaths()
        self.assertEqual(len(paths), 362)
Code example #5
File: word2vec.py Project: cltk/cltk
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = greek_stops  # Greek stopword list (assumed import, analogous to latin_stops)
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
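The sentences yielded by gen_docs are presumably consumed by a Word2Vec training step elsewhere in word2vec.py; the sketch below shows one way to do that with gensim 4.x (the import and the hyperparameter values are assumptions, not taken from cltk).

from gensim.models import Word2Vec

# Materialize the generator and train a small illustrative model.
sentences = list(gen_docs('phi5', lemmatize=False, rm_stops=True))
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2)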
Code example #6
File: word2vec.py Project: vierth/cltk
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words

        if rm_stops:
            stops = greek_stops  # Greek stopword list (assumed import, analogous to latin_stops)
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence,
                                    rm_punctuation=True,
                                    rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [
                    w[1:] if w.startswith('-') else w for w in sentence
                ]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence