def feature_extraction(corpus_path, output, features):
    '''Perform a feature extraction'''
    download_corpus(corpus_path)
    setup_tokenizers(terminal_punctuation=('.', '?', '!'), language='english')
    qcrit.extract_features.main(
        corpus_dir=os.path.join(*corpus_path),
        file_extension_to_parse_function={
            'txt': parse_txt,
        },
        output_file=output,
        features=features,
    )
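# Usage sketch (not in the original file; the corpus path, output name, and
# feature list below are hypothetical illustrations):
#
#     feature_extraction(
#         corpus_path=('my_corpus', 'texts'),
#         output='output.pickle',
#         features=['mean_sentence_length'],
#     )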
def main():
    '''Main'''
    corpus_path = ('tesserae', 'texts', 'grc')
    download_corpus(corpus_path)

    #'FULL STOP', 'SEMICOLON', 'GREEK QUESTION MARK'
    setup_tokenizers(terminal_punctuation=('.', ';', ';'))

    if len(sys.argv) > 2 and sys.argv[2] == '-u':
        import qcrit.features.universal_features #seemingly unused, but allows the recognition of features
    else:
        import qcrit.features.ancient_greek_features #seemingly unused, but allows the recognition of features

    #Feature extraction
    qcrit.extract_features.main(
        os.path.join(*corpus_path),
        {'tess': qcrit.extract_features.parse_tess},
        #Exclude all files of genres not specified. Exclude composite files no matter what
        excluded_paths=composite_files,
        output_file=None if len(sys.argv) <= 1 else sys.argv[1],
    )
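# Entry-point sketch (assumed; the excerpt ends before the original's guard).
# Inferred invocation, with a hypothetical script name:
#     python greek_features.py <output-file>        -> ancient Greek features
#     python greek_features.py <output-file> -u     -> universal features
# Omitting <output-file> passes output_file=None to the extraction.
if __name__ == '__main__':
    main()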
# -*- coding: utf-8 -*-
#pylint: disable = missing-docstring, blacklisted-name, unused-argument, invalid-name, line-too-long, protected-access
import unittest
import re

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars

import context #pylint: disable=unused-import
from qcrit import textual_feature

#[^\s\d’”\'\")\)\]\}\.,:;]
#[“‘—\-†&vâ\*\^(α-ωΑ-Ὠ`̔]
#΄´´``′″‴
textual_feature.setup_tokenizers(terminal_punctuation=('.', ';', ';'))

p = PunktLanguageVars()
#TODO don't mess with the PunktLanguageVars instance variables, mess with the class variables
p._re_word_tokenizer = re.compile(
    PunktLanguageVars._word_tokenize_fmt % {
        'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'MultiChar': PunktLanguageVars._re_multi_char_punct,
        'WordStart': r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
    },
    re.UNICODE | re.VERBOSE,
)
p._re_period_context = re.compile(
    PunktLanguageVars._period_context_fmt % {
        'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'SentEndChars': p._re_sent_end_chars,
    },
    re.UNICODE | re.VERBOSE,
)
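# Illustrative check (not part of the original excerpt): PunktLanguageVars.word_tokenize
# uses the compiled pattern assigned above, so Greek punctuation should come back as
# separate tokens. The sample text and expected output are assumptions.
class TestCustomWordTokenizer(unittest.TestCase):
    def test_greek_punctuation_is_split_off(self):
        self.assertEqual(p.word_tokenize('τί φῄς;'), ['τί', 'φῄς', ';'])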
#pylint: disable = missing-docstring, blacklisted-name, unused-argument, invalid-name
'''Test feature extraction'''
import unittest

import context #pylint: disable=unused-import
from qcrit.extract_features import main, parse_tess
from qcrit.textual_feature import textual_feature, setup_tokenizers

#Run this file with "-b" to ignore output in passing tests (failing tests still display output)

setup_tokenizers(terminal_punctuation=('.', ';', ';')) #'FULL STOP', 'SEMICOLON', 'GREEK QUESTION MARK'

@textual_feature(tokenize_type='words', debug=True)
def dummy_feature(text):
    pass

class TestExtractFeatures(unittest.TestCase):

    def testAllNone(self):
        self.assertRaises(ValueError, main, corpus_dir=None, file_extension_to_parse_function=None)

    def testInvalidCorpusDirectory(self):
        self.assertRaises(ValueError, main, corpus_dir='abc', file_extension_to_parse_function={'tess': parse_tess})

    def testExcludedPaths(self):
        self.assertRaises(ValueError, main, corpus_dir='.', file_extension_to_parse_function={'tess': parse_tess}, excluded_paths=[])

    def testEmptyFeatures(self):
        self.assertRaises(ValueError, main, corpus_dir='.', file_extension_to_parse_function={'tess': parse_tess}, features=[])

    def testOutputAlreadyExists(self):
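        # Sketch of a plausible body (the original is cut off in this excerpt).
        # It assumes main() raises ValueError rather than overwriting an existing
        # output file, in line with the other validation errors above.
        import tempfile
        with tempfile.NamedTemporaryFile(suffix='.pickle') as existing_output:
            self.assertRaises(
                ValueError, main,
                corpus_dir='.',
                file_extension_to_parse_function={'tess': parse_tess},
                output_file=existing_output.name,
            )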
'''
Latin features
'''
import os
import subprocess
import sys
import re

import qcrit.extract_features
from qcrit.textual_feature import textual_feature, setup_tokenizers

CORPUS_DIR = os.path.join('tesserae', 'texts', 'la')
TERMINAL_PUNCTUATION = ('.', '?', '!')
setup_tokenizers(terminal_punctuation=TERMINAL_PUNCTUATION)

def _download_corpus():
    '''
    Downloads the Latin corpus from tesserae
    Sparse checkout reference: https://stackoverflow.com/a/28039894/7102572
    '''
    if not os.path.isdir('tesserae'):
        try:
            cmd_list = (
                'mkdir tesserae',
                'git -C tesserae init',
                'git -C tesserae remote add origin https://github.com/timgianitsos/tesserae.git',
                'git -C tesserae config core.sparseCheckout true',
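                # The excerpt cuts off inside cmd_list. Per the sparse-checkout
                # recipe cited in the docstring, the remaining steps would
                # whitelist the desired paths and pull, after which each command
                # would be run, e.g. (sketch, not the original code):
                #
                #     for cmd in cmd_list:
                #         subprocess.run(cmd.split(), check=True)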
import os
from shlex import quote

_CURRENT_DIR = os.path.dirname(__file__)

#If the output file already exists, the feature extraction code will not override it
#Delete the output file so that the demo can create one
if os.path.isfile(os.path.join(_CURRENT_DIR, 'output.pickle')):
    os.system('rm ' + quote(os.path.join(_CURRENT_DIR, 'output.pickle')))

import context #pylint: disable=unused-import
import qcrit.extract_features
from qcrit.textual_feature import textual_feature, setup_tokenizers

from functools import reduce
from unicodedata import normalize

#Let the sentence tokenizer know that periods and semicolons are the punctuation marks that end sentences
setup_tokenizers(terminal_punctuation=('.', ';'))

#Using 'words' makes the input 'text' parameter become a list of words
@textual_feature(tokenize_type='words')
def num_conjunctions(text): #parameter must be the text of a file
    return reduce(
        lambda count, word: count + (1 if word in {
            normalize('NFD', val) for val in ['καί', 'καὶ', 'ἀλλά', 'ἀλλὰ', 'ἤ', 'ἢ']
        } else 0),
        text, 0,
    )

#Using 'sentences' makes the input 'text' parameter become a list of sentences
@textual_feature(tokenize_type='sentences')
def mean_sentence_length(text): #parameter must be the text of a file
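    # Sketch of a plausible body (the original is cut off in this excerpt).
    # With tokenize_type='sentences', each element of 'text' is one sentence
    # string, so this averages sentence length measured in characters.
    return reduce(lambda total, sentence: total + len(sentence), text, 0) / len(text)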