def main():
    """Create a corpus from a CSV file named on the command line and save it.

    Command-line arguments:
        csv_file_name: name of the input CSV file (positional).
        -f / --feature_file_name: name of the input feature file (default '').
        -d / --delimiter: column delimiter; escape sequences in the value are
            decoded with the ``unicode_escape`` codec (default is a tab).
        -t / --trans_delimiter: segment delimiter (default '').

    The CSV path is first used exactly as given, and the corpus is saved as
    ``<path_leaf of the extension-less path>.corpus``. If that attempt raises
    ``FileNotFoundError``, the input files are looked up relative to the
    directory containing this script and the ``.corpus`` file is written
    next to the CSV file found there.

    Raises:
        FileNotFoundError: if the files cannot be found in either location.
    """
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Phonological CorpusTools: corpus object creation CL interface')
    parser.add_argument('csv_file_name', help='Name of input CSV file')
    parser.add_argument('-f', '--feature_file_name', default='', type=str,
                        help='Name of input feature file')
    parser.add_argument('-d', '--delimiter', default='\t', type=str,
                        help='Character that delimits columns in the input file')
    parser.add_argument('-t', '--trans_delimiter', default='', type=str,
                        help='Character that delimits segments in the input file')
    args = parser.parse_args()
    ####

    # Turn escape sequences typed on the command line (e.g. a literal
    # backslash followed by "t") into the characters they stand for.
    delimiter = codecs.getdecoder("unicode_escape")(args.delimiter)[0]

    try:
        # Full path specified
        filename = path_leaf(os.path.splitext(args.csv_file_name)[0])
        corpus = load_corpus_csv(args.csv_file_name, args.csv_file_name,
                                 delimiter, args.trans_delimiter,
                                 args.feature_file_name)
        save_binary(corpus, filename + '.corpus')
    except FileNotFoundError:
        # Retry relative to the directory holding this script. os.path.join
        # supplies the platform's separator, so a single attempt replaces the
        # separate '/' and '\\' retries.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        csv_path = os.path.join(script_dir, args.csv_file_name)
        # Keep an unspecified feature file unspecified instead of turning the
        # empty default into a path that points at the script directory.
        feature_path = args.feature_file_name
        if feature_path:
            feature_path = os.path.join(script_dir, feature_path)
        corpus = load_corpus_csv(args.csv_file_name, csv_path,
                                 delimiter, args.trans_delimiter,
                                 feature_path)
        save_binary(corpus, os.path.splitext(csv_path)[0] + '.corpus')
def test_corpus_csv(csv_test_dir, unspecified_test_corpus):
    """Check delimiter handling when loading ``example.txt`` as a corpus.

    A tab delimiter is expected to raise ``DelimiterError``; a comma
    delimiter is expected to yield a ``Corpus`` equal to the
    ``unspecified_test_corpus`` fixture.
    """
    csv_path = os.path.join(csv_test_dir, 'example.txt')

    # Loading with a tab as the column delimiter must fail.
    with pytest.raises(DelimiterError):
        load_corpus_csv('example', csv_path, delimiter='\t')

    # Loading with a comma must succeed and match the reference corpus.
    loaded = load_corpus_csv('example', csv_path, delimiter=',')
    assert isinstance(loaded, Corpus)
    assert loaded == unspecified_test_corpus
def test_stressed(csv_test_dir):
    """Load ``stressed.txt`` with the second annotation type set to 'stress'.

    Verifies that the 'uw' inventory entry keeps the symbol 'uw' and that the
    inventory's ``stresses`` mapping matches the expected sets for the keys
    '1' and '0'.
    """
    csv_path = os.path.join(csv_test_dir, 'stressed.txt')
    annotation_types, _ = inspect_csv(csv_path, coldelim=',')
    print(annotation_types)

    # Switch the number behavior of the second inspected annotation type.
    annotation_types[1].number_behavior = 'stress'
    corpus = load_corpus_csv('stressed', csv_path, ',', annotation_types)

    assert corpus.inventory['uw'].symbol == 'uw'

    expected_stresses = {
        '1': {'uw', 'iy'},
        '0': {'uw', 'iy', 'ah'},
    }
    assert corpus.inventory.stresses == expected_stresses
def run(self):
    """Load the corpus described by ``self.kwargs`` and emit the outcome.

    ``text_type`` and ``isDirectory`` are popped from ``self.kwargs``; the
    remaining entries are passed as keyword arguments to the loader selected
    for that text type (the directory or the single-discourse variant).
    For 'buckeye' and 'timit' the text type is also stored in
    ``self.kwargs['dialect']`` before loading.

    Signals emitted:
        errorEncountered: with the exception if loading fails; exceptions
            other than ``PCTError`` are wrapped in ``PCTPythonError``.
        finishedCancelling: if ``self.stopped`` is set after loading.
        dataReady: with the loaded corpus otherwise.
    """
    time.sleep(0.1)
    textType = self.kwargs.pop('text_type')
    isDirectory = self.kwargs.pop('isDirectory')
    logging.info('Importing {} corpus named {}'.format(
        textType, self.kwargs['corpus_name']))
    # The '{}' placeholder is required; without it the path is silently
    # dropped from the log message.
    logging.info('Path: {}'.format(self.kwargs['path']))
    log_annotation_types(self.kwargs['annotation_types'])
    try:
        if textType == 'spelling':
            loader = (load_directory_spelling if isDirectory
                      else load_discourse_spelling)
        elif textType == 'transcription':
            loader = (load_directory_transcription if isDirectory
                      else load_discourse_transcription)
        elif textType == 'ilg':
            loader = (load_directory_ilg if isDirectory
                      else load_discourse_ilg)
        elif textType == 'textgrid':
            loader = (load_directory_textgrid if isDirectory
                      else load_discourse_textgrid)
        elif textType == 'csv':
            loader = load_corpus_csv
        elif textType in ['buckeye', 'timit']:
            self.kwargs['dialect'] = textType
            loader = (load_directory_multiple_files if isDirectory
                      else load_discourse_multiple_files)
        else:
            # Report an unknown type through the normal error signal instead
            # of failing later on an unbound ``corpus`` variable.
            raise ValueError('Unsupported text type: {}'.format(textType))
        corpus = loader(**self.kwargs)
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopped:
        time.sleep(0.1)
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(corpus)
def run(self):
    """Import a corpus using the settings in ``self.kwargs`` and signal the result.

    The ``text_type`` and ``isDirectory`` entries are removed from
    ``self.kwargs``; everything left is forwarded as keyword arguments to
    the loader matching the text type (directory variant when
    ``isDirectory`` is truthy, single-discourse variant otherwise). The
    'buckeye' and 'timit' types additionally set ``self.kwargs['dialect']``
    to the text type before the loader is called.

    Signals emitted:
        errorEncountered: with the exception if loading fails; exceptions
            other than ``PCTError`` are wrapped in ``PCTPythonError``.
        finishedCancelling: if ``self.stopped`` is set after loading.
        dataReady: with the loaded corpus otherwise.
    """
    time.sleep(0.1)
    textType = self.kwargs.pop('text_type')
    isDirectory = self.kwargs.pop('isDirectory')
    logging.info('Importing {} corpus named {}'.format(textType, self.kwargs['corpus_name']))
    # The format string needs a '{}' placeholder, otherwise the path
    # argument is ignored and only 'Path: ' is logged.
    logging.info('Path: {}'.format(self.kwargs['path']))
    log_annotation_types(self.kwargs['annotation_types'])
    try:
        if textType == 'spelling':
            if isDirectory:
                loader = load_directory_spelling
            else:
                loader = load_discourse_spelling
        elif textType == 'transcription':
            if isDirectory:
                loader = load_directory_transcription
            else:
                loader = load_discourse_transcription
        elif textType == 'ilg':
            if isDirectory:
                loader = load_directory_ilg
            else:
                loader = load_discourse_ilg
        elif textType == 'textgrid':
            if isDirectory:
                loader = load_directory_textgrid
            else:
                loader = load_discourse_textgrid
        elif textType == 'csv':
            loader = load_corpus_csv
        elif textType in ['buckeye', 'timit']:
            self.kwargs['dialect'] = textType
            if isDirectory:
                loader = load_directory_multiple_files
            else:
                loader = load_discourse_multiple_files
        else:
            # An unknown type would otherwise leave ``corpus`` unbound and
            # fail outside the error handling below.
            raise ValueError('Unsupported text type: {}'.format(textType))
        corpus = loader(**self.kwargs)
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopped:
        time.sleep(0.1)
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(corpus)
def main():
    """Create a corpus from a CSV file named on the command line and save it.

    Command-line arguments:
        csv_file_name: name of the input CSV file (positional).
        -f / --feature_file_name: name of the input feature file (default '').
        -d / --delimiter: column delimiter (default None); a non-empty value
            has its escape sequences decoded with the ``unicode_escape``
            codec, an absent or empty value is passed through unchanged.
        -t / --trans_delimiter: segment delimiter (default None).

    The CSV path is first used exactly as given, and the corpus is saved as
    ``<path_leaf of the extension-less path>.corpus``. If that attempt raises
    ``FileNotFoundError``, the input files are looked up relative to the
    directory containing this script and the ``.corpus`` file is written
    next to the CSV file found there.

    Raises:
        FileNotFoundError: if the files cannot be found in either location.
    """
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Phonological CorpusTools: corpus object creation CL interface')
    parser.add_argument('csv_file_name', help='Name of input CSV file')
    parser.add_argument('-f', '--feature_file_name', default='', type=str,
                        help='Name of input feature file')
    parser.add_argument('-d', '--delimiter', default=None, type=str,
                        help='Character that delimits columns in the input file')
    parser.add_argument('-t', '--trans_delimiter', default=None, type=str,
                        help='Character that delimits segments in the input file')
    args = parser.parse_args()
    ####

    # Only decode escape sequences when a delimiter was actually supplied;
    # None (or an empty string) is handed to the loader untouched.
    if args.delimiter:
        delimiter = codecs.getdecoder("unicode_escape")(args.delimiter)[0]
    else:
        delimiter = args.delimiter

    try:
        # Full path specified
        filename = path_leaf(os.path.splitext(args.csv_file_name)[0])
        corpus = load_corpus_csv(args.csv_file_name, args.csv_file_name,
                                 delimiter, args.trans_delimiter,
                                 annotation_types=None,
                                 feature_system_path=args.feature_file_name)
        save_binary(corpus, filename + '.corpus')
    except FileNotFoundError:
        # Retry relative to the directory holding this script. os.path.join
        # supplies the platform's separator, so a single attempt replaces the
        # separate '/' and '\\' retries.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        csv_path = os.path.join(script_dir, args.csv_file_name)
        # Keep an unspecified feature file unspecified instead of turning the
        # empty default into a path that points at the script directory.
        feature_path = args.feature_file_name
        if feature_path:
            feature_path = os.path.join(script_dir, feature_path)
        corpus = load_corpus_csv(args.csv_file_name, csv_path,
                                 delimiter, args.trans_delimiter,
                                 annotation_types=None,
                                 feature_system_path=feature_path)
        save_binary(corpus, os.path.splitext(csv_path)[0] + '.corpus')