import pipeline_caller


def main():
    results = []
    sents = open('../d2_data/all_sents_spellchecked.txt', 'r').read().split('\n')
    caller = pipeline_caller.PipelineCaller()
    tool_name = "morphanalyzer"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"
    start_i = 346  # resume from where a previous run left off
    i = 0

    for s in sents[start_i:]:
        try:
            print(s)
            curr_result = []

            # Analyze the sentence word by word and flatten each
            # multi-line analysis onto one line
            for w in s.split():
                r = caller.call(tool_name, w, api_token)
                r = ' '.join(r.split('\n'))
                curr_result.append(r)

            curr_result = '\n'.join(curr_result)
            results.append(
                '<S> <S>+BSTag\n{0}\n</S> </S>+ESTag'.format(curr_result))
            i += 1
            print(curr_result)
        except ConnectionResetError:
            pass  # skip the sentence if the connection drops

        # Checkpoint the accumulated analyses every 1000 sentences
        if i % 1000 == 0 or i == len(sents[start_i:]):
            with open('parsed_sents_{0}.txt'.format(i), 'w') as f:
                f.write('\n'.join(results))

import pipeline_caller


def main():
    caller = pipeline_caller.PipelineCaller()
    tool_name = "spellcheck"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"
    result = ''
    text = open('../d2_data/all_verbs.txt', 'r').read()
    result += caller.call(tool_name, text, api_token)

    with open('../d2_data/all_verbs_spellchecked.txt', 'w') as f:
        f.write(result)

def module_pipelineNoisy_whole_test(self):
    try:
        caller = pipeline_caller.PipelineCaller(
            'pipelineNoisy', KATANA, os.environ['pipeline_token'], 'whole')
        r = re.compile(r'(\d+)(\t.+?){7,}', re.MULTILINE)
        response = caller.call()
        print(response)
        assert len(re.findall(r, response)) == 33
    except Exception:
        self.fail('Exception thrown')

def module_Vowelizer_word_test(self):
    try:
        caller = pipeline_caller.PipelineCaller(
            'Vowelizer', KELIME, os.environ['pipeline_token'], 'word')
        r = re.compile(r'(.+?\n)', re.MULTILINE)
        response = caller.call()
        print(response)
        assert len(re.findall(r, response)) == 4
    except Exception:
        self.fail('Exception thrown')

def module_pipelineNoisy_sentence_test(self):
    try:
        caller = pipeline_caller.PipelineCaller(
            'pipelineNoisy', UCDORT, os.environ['pipeline_token'], 'sentence')
        r1 = re.compile(r'(1)(\t.+?){7,}', re.MULTILINE)
        r2 = re.compile(r'(5)(\t.+?){7,}', re.MULTILINE)
        response = caller.call()
        print(response)
        assert len(re.findall(r1, response)) == 2 and len(
            re.findall(r2, response)) == 1
    except Exception:
        self.fail('Exception thrown')

import getopt
import sys

import pipeline_caller
from unidecode import unidecode


def main(argv):
    table_name = ''
    token = 'LQiWv0FTmQEJRVbun8Rqld6WZCIrGUyO'
    tool = 'normalize'

    try:
        opts, args = getopt.getopt(argv, "ut:", ["table="])
    except getopt.GetoptError:
        print('normalization.py -t <table_name>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-u':
            print('normalization.py -t <table_name>')
            sys.exit()
        elif opt in ("-t", "--table"):
            table_name = arg

    db = PathyDB()  # project-local database wrapper

    # ASCII-fold Turkish characters in the table name
    table_name = table_name.lower()
    for turkish, latin in (('ç', 'c'), ('Ç', 'c'), ('ü', 'u'), ('Ü', 'u'),
                           ('ö', 'o'), ('Ö', 'o'), ('ğ', 'g'), ('Ğ', 'g'),
                           ('ş', 's'), ('Ş', 's'), ('ı', 'i'), ('İ', 'i')):
        table_name = table_name.replace(turkish, latin)

    tweets = db.get_all_not_normalized_tweets(table_name)

    for tweet in tweets:
        caller = pipeline_caller.PipelineCaller(
            tool, unidecode(tweet['tweet_text']), token)
        normalized_text = caller.call()
        print(normalized_text)
        tweet['normalized_text'] = normalized_text
        db.update_tweet(table_name, tweet)

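# The loop above folds the Turkish characters pair by pair. The same folding
# can be expressed with a single translation table; the sketch below is
# illustrative and the fold_table_name helper is not part of the original
# script.
TR_FOLD = str.maketrans('çÇüÜöÖğĞşŞıİ', 'cCuUoOgGsSiI')


def fold_table_name(name):
    # Translate before lowercasing: lowercasing 'İ' directly yields 'i'
    # plus a combining dot, which a plain character replace would miss.
    return name.translate(TR_FOLD).lower()
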
import csv

import pipeline_caller


def main():
    caller = pipeline_caller.PipelineCaller()
    tool_name = "spellcheck"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"
    result = ''
    data = open('../d2_data/query_results_all_joined_sents.csv')
    reader = csv.reader(data)
    sents = []
    indices = []

    for r in reader:
        sents.append(r[0])
        indices.append(r[2])

    with open('../d2_data/target_indices.txt', 'w') as f:
        f.write('\n'.join(indices))

    # Spellcheck the sentences in batches of 10,000
    for i in range(0, 8):
        text = '\n'.join(sents[i * 10000:(i + 1) * 10000])
        result += caller.call(tool_name, text, api_token)

    with open('../d2_data/all_sents_spellchecked.txt', 'w') as f:
        f.write(result)

""" Send list of word windows to ITU for spellchecking Heikal Badrulhisham <*****@*****.**>, 2019 """ import pipeline_caller import csv from collections import defaultdict import pickle import os # For calling ITU pipeline caller = pipeline_caller.PipelineCaller() tool_name = "spellcheck" api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7" # Dictionary of past spellcheck results if os.path.isfile('spellcheck_history.pkl'): spellcheck_history = pickle.load(open('spellcheck_history.pkl', 'rb')) else: spellcheck_history = defaultdict(str) def spellcheck(word): if word in spellcheck_history: return spellcheck_history[word] else: sc = caller.call(tool_name, word, api_token).replace('\r\n', '') spellcheck_history[word] = sc return sc
def nlpPipeline(text):
    # Tool name, text, and API access token go to the PipelineCaller
    # constructor; call() then runs the configured tool
    caller = pipeline_caller.PipelineCaller('normalize', text, '')
    result = caller.call()
    return result

def module_exception_test(self):
    try:
        caller = pipeline_caller.PipelineCaller()
        caller.call("pipelineNoisy", "test sentence", "random token")
    except Exception:
        self.fail("Exception thrown")

def nlpPipeline(text):
    text = text.encode('utf8')
    caller = pipeline_caller.PipelineCaller(
        'normalize', text, 'MKHVuqqLiARKHNFq7eEOuOJr54Mncxir', 'whole')
    result = caller.call()
    return result
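
# Minimal usage sketch for the helper above; the sample sentence is
# illustrative only and assumes the token in the snippet is valid.
if __name__ == '__main__':
    print(nlpPipeline('yarin okula gidicem'))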