# -*- coding: utf-8 -*-
#####################
#
# © 2012–2015 Autodesk Development Sàrl
#
# Created in 2012 by Petra Ribiczey
#
# Changelog
# v3.6.3 Modified on 12 Jan 2015 by Ventsislav Zhechev
# Fixed a bug where terms, for which a source context could not be found, would still be returned to the client.
# Modified to remove the complete candidate chunk if one of its words consists of a single character.
#
# v3.6.2 Modified on 02 Oct 2014 by Ventsislav Zhechev
# Fixed a bug where the set of original word tokens (new_content_orig_tok) was always empty.
#
# v3.6.1 Modified on 12 Aug 2014 by Ventsislav Zhechev
# Modified to use aliases for staging and production Solr servers.
#
# v3.6 Modified on 21 May 2014 by Ventsislav Zhechev
# Updated to connect to new Solr setup for term lookup.
# Added a few status messages.
# Removed some dead code.
#
# v3.5 Modified on 16 Nov 2013 by Ventsislav Zhechev
# Switched to using a logger provided by the Service module for debug output.
# Fixed a bug with some global counters.
# Removed some unused imports.
#
# v3.4 Modified on 05 Nov 2013 by Ventsislav Zhechev
# Modified to use one ngram list per language to facilitate language-specific terminology extraction.
#
# v3.3 Modified on 04 Nov 2013 by Ventsislav Zhechev
# Put in a MAJOR update to much of the chunk processing code geared towards speed optimisation.
# Delayed the chunk substring lookup to the moment with the least number of chunks to check before sending data off to NeXLT.
#
# v3.2 Modified on 28 Oct 2013 by Ventsislav Zhechev
# Removed the static processing of product names and available languages.
#
# v3.1 Modified on 21 Oct 2013 by Ventsislav Zhechev
# Modified Getterms method to process a list of segments, rather than a single string containing all data.
# Simplified the generation of the final list of terms.
#
# v3.0.1 Modified on 18 Oct 2013 by Ventsislav Zhechev
# Simplified portions of the source code
#
# v3 Modified on 15 Oct 2013 by Ventsislav Zhechev
# Overhauled the architecture to allow the use as a module from within other Python scripts.
# Removed dead/useless code
# Improved code coherence
#
# v2.3.3 Modified on 29 Aug 2013 by Ventsislav Zhechev
# Moved up the validation of the command line parameters to before the tokeniser training.
# ‘NEW_PRODUCT’ is now an allowed product code option.
# Fixed a bug where the empty string could be extracted as a term candidate.
#
# v2.3.2 Modified on 28 Aug 2013 by Ventsislav Zhechev
# Fixed two index-out-of-bounds bugs.
#
# v2.3.1 Modified on 21 Jun 2013 by Ventsislav Zhechev
# Modified the code to filter substring terms. Now we are keeping the longest term available.
# Commented out all debug output.
#
# v2.3 Modified on 18 Jun 2013 by Ventsislav Zhechev
# Fixed a few UTF-8 related issues.
# Fixed a few minor bugs.
# Switched off substring filtering, as it does not appear to be beneficial.
#
# v2.2 Modified on 17 Jun 2013 by Ventsislav Zhechev
# Fixed a few bugs in chunk processing.
# Reduced the memory usage through using a single temp list for chunks.
# Reduced the number of cycles through the chunks by applying different filters in a single run.
#
# v2.1 Modified on 14 Jun 2013 by Ventsislav Zhechev
# Added more progress output to stderr.
# Combined some processing steps to improve performance.
# Removed some unnecessary processing steps.
# Switched to the brill POS tagger for efficiency—the TnT tagger was too slow.
#
# v2 Modified on 27 May 2013 by Ventsislav Zhechev
# Modified the hard-coded file paths.
# Added command-line parameter handling, so that the script could be called directly, rather than launching it from within Python.
# Added extra check for bad POS-tagger output
# Added progress indicator output
#
# v1 Modified by Petra Ribiczey
# Original version.
#
#####################
"""
Prerequisites:
- files are encoded as utf-8
-
"""
import nltk, codecs, locale, requests, json, re, bz2
from nltk.tokenize import LineTokenizer, TreebankWordTokenizer
from nltk.tag import brill, tnt, DefaultTagger
from nltk.corpus import conll2000, treebank
from nltk.corpus.reader import TaggedCorpusReader
from operator import itemgetter
from functools import cmp_to_key
import Service
# Module-level variables
pos_tagger = None
word_tok = sent_tokenizer = None
ngrams = {}
nowords = None
__debug_on__ = False
adskCorpusRoot = adskUnwordsRoot = ngramFilePath = None
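# Typical call sequence when this file is used as a module (a sketch; the argument values
# below are illustrative, not taken from any production configuration):
#   import Extractor
#   Extractor.init(None)             # or a dict with adskCorpusRoot, adskUnwordsRoot, ngramFilePath
#   Extractor.loadAuxiliaryData()
#   Extractor.trainPOSTagger(False)  # False selects the default Brill tagger
#   terms = Extractor.Getterms(["Define user-specific information %d"], "jpn", ["INV"], True)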
def init(settings):
# Service.logger.debug("Initialising terminology extraction system...")
    if settings is None:
settings = dict(
adskCorpusRoot="/Volumes/OptiBay/ADSK_Software/termExtraction/auxiliaryData/taggerCorpus",
adskUnwordsRoot="/Volumes/OptiBay/ADSK_Software/termExtraction/auxiliaryData/unwords",
ngramFilePath="/Volumes/OptiBay/ADSK_Software/termExtraction/auxiliaryData/enu.ngrams.nocounts",
)
global adskCorpusRoot, adskUnwordsRoot, ngramFilePath
adskCorpusRoot = settings["adskCorpusRoot"]
adskUnwordsRoot = settings["adskUnwordsRoot"]
ngramFilePath = settings["ngramFilePath"]
    # Define tokenizers (these could be changed if needed.)
    global word_tok, sent_tokenizer
    word_tok = nltk.TreebankWordTokenizer()
    sent_tokenizer = nltk.LineTokenizer()
def preplists(filelocation):
if __debug_on__:
Service.logger.debug("\tReading file " + filelocation + "...")
lineSet = set()
for line in codecs.open(filelocation, "r", 'utf-8'):
lineSet.add(line.rstrip().lower())
return lineSet
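# preplists() lowercases and deduplicates one entry per line, so a file containing the lines
# "Cancel", "OK", "ok" would yield the set {u'cancel', u'ok'} (hypothetical file contents).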
def loadAuxiliaryData():
# Service.logger.debug("Loading auxiliary data for terminology extraction system...")
global ngramFilePath, adskUnwordsRoot
global ngrams, nowords
# ngrams = codecs.open(ngramFilePath, "r", "utf-8").read()
conn = Service.connectToDB()
cursor = conn.cursor()
cursor.execute("select LangCode3Ltr from TargetLanguages")
langs = cursor.fetchall()
conn.close()
for lang in langs:
if __debug_on__:
Service.logger.debug("\t\tReading nGram file " + ngramFilePath+"."+lang[0].upper()+".bz2...")
ngrams[lang[0]] = bz2.BZ2File(ngramFilePath+"."+lang[0].upper()+".bz2", "r").read()
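        # For a language row like ('jpn',) this reads "<ngramFilePath>.JPN.bz2" into memory as one
        # large decompressed string; candidate terms are later checked against it with re.search.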
# Load Autodesk-related lists:
# - ngram-list (from Ventzi, including only the ngrams without counts)
# - NeXLT product names (------ there is an N/A in it???)
# - NeXLT language list
# - city names from http://www.geodatasource.com/ and http://www.maxmind.com/en/worldcities
# - words which should not be harvested (unwords and general words)
# - Autodesk trademarks
# - company names
# Define nowords as filter
    nowords = preplists(adskUnwordsRoot+"/general_words.txt").union(
        preplists(adskUnwordsRoot+"/un_words.txt"),
        preplists(adskUnwordsRoot+"/autodesk_trademarks.txt"),
        preplists(adskUnwordsRoot+"/company_names.txt"),
        preplists(adskUnwordsRoot+"/cities_regions.txt"))
def trainPOSTagger(useTnTTagger):
global __debug_on__
global pos_tagger
global adskCorpusRoot
# Train TNT/Brill POS-tagger using own training data + treebank data from nltk. Tested that using treebank data improves results.
autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
train_sents = autodesk.tagged_sents() + treebank.tagged_sents()
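    # Both corpora provide sentences as lists of (word, tag) pairs, e.g. a sentence like
    # [('Select', 'VB'), ('the', 'DT'), ('object', 'NN')] (illustrative example, not real corpus data).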
# Use TnT tagger on request
if useTnTTagger:
if __debug_on__:
Service.logger.debug("Using TnT POS tagger...")
unk_tagger = DefaultTagger('NN')
pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
pos_tagger.train(train_sents)
# Use Brill tagger by default
else:
if __debug_on__:
Service.logger.debug("Using Brill POS tagger...")
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
if not backoff:
backoff = tagger_classes[0](tagged_sents)
del tagger_classes[0]
for cls in tagger_classes:
tagger = cls(tagged_sents, backoff=backoff)
backoff = tagger
return backoff
word_patterns = [
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
(r'.*ould$', 'MD'),
(r'.*ing$', 'VBG'),
(r'.*ed$', 'VBD'),
(r'.*ness$', 'NN'),
(r'.*ment$', 'NN'),
(r'.*ful$', 'JJ'),
(r'.*ious$', 'JJ'),
(r'.*ble$', 'JJ'),
(r'.*ic$', 'JJ'),
(r'.*ive$', 'JJ'),
(r'.*est$', 'JJ'),
(r'^a$', 'PREP'),
]
raubt_tagger = backoff_tagger(train_sents, [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger], backoff=nltk.tag.RegexpTagger(word_patterns))
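        # The chain built above answers with the TrigramTagger first and falls back through
        # Bigram -> Unigram -> Affix down to the regexp patterns in word_patterns when a tagger has no answer.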
templates = [
brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
]
trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
# Define harvesting process
# 'content' is the text to harvest terms from
# 'lang' is the target language
# 'prods' is a list of product codes on NeXLT
# example for a string that is in an Inventor product: Getterms(["Define user-specific information %d"], 'jpn', ['INV'], False)
def Getterms(content, lang, prods, returnJSON):
global __debug_on__, bad_stemmer_1, bad_stemmer_3, bad_stemmer_4
bad_stemmer_1 = bad_stemmer_3 = bad_stemmer_4 = 0
Service.logger.debug("Started processing for %d segments for language %s" % (len(content), lang))
new_content = set()
new_content_orig = ""
new_content_orig_tok = set()
for seg in set(content):
seg = seg.replace('\\r','\r') # Treating UI strings containing \r escapes
seg = seg.replace('\\n','\n') # Treating UI strings containing \n escapes
seg = seg.replace('\r\n','\n')# Collapsing new lines
seg = seg.replace('\n ','\n') # Clean-up the line endings—not sure if useful at all
new_content_orig += " " + seg
for word in word_tok.tokenize(seg):
new_content_orig_tok.add(word)
seg = seg.replace('%','')
seg = seg.replace(".. ."," ...")
seg = seg.replace('<openparen>','(')
seg = seg.replace('<closeparen>',')')
seg = seg.replace("'","'")
seg = seg.replace('"', '"')
seg = seg.replace('&', '&')
seg = seg.replace('<', '<')
seg = seg.replace('>', '>')
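        # After these replacements a raw UI segment such as "Select &lt;object&gt;" (hypothetical)
        # would read "Select <object>" before HTML cleanup and tokenisation.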
# Do the following even occur in our data?
        # new_content = new_content.replace('&circ;', '^')
        # new_content = new_content.replace('&tilde;', '~')
        # new_content = new_content.replace('&ndash;', '–')
        # new_content = new_content.replace('&mdash;', '—')
        # new_content = new_content.replace('&lsquo;', '‘')
        # new_content = new_content.replace('&rsquo;', '’')
        # new_content = new_content.replace('&sbquo;', ',')
        # new_content = new_content.replace('&ldquo;', '“')
        # new_content = new_content.replace('&rdquo;', '”')
        # new_content = new_content.replace('&bdquo;', '"')
        # new_content = new_content.replace('&permil;', '‰')
        # new_content = new_content.replace('&euro;', '€')
seg_htmlcleaned = nltk.clean_html(seg)
seg = seg_htmlcleaned.replace('\\t','\n')
# Some very crude pre-tokenisation
seg = seg.replace(':',' :')
seg = seg.replace('\t','\n')
seg = seg.replace("\\", '\n')
seg = seg.replace('|','\n')
# It’s not quite clear what this is supposed to do
seg = seg.replace("&","")
seg = seg.replace("[\\w]+_[\\w]+","\n")
seg = seg.replace("[\\w]+_","\n")
seg = seg.replace("_[\\w]+","\n")
# Service.logger.debug("Segment: " + seg)
for seg in sent_tokenizer.tokenize(seg):
new_content.add(seg)
if __debug_on__:
# Service.logger.debug("Finished sentence segmentation.")
Service.logger.debug("Finished character-level pre-processing.")
# word tokenize the sentences
new_content_tokenized = [word_tok.tokenize(line) for line in new_content]
if __debug_on__:
Service.logger.debug("Finished main tokenisation.")
# Remove empty lines
# Remove one-word lines which are all caps. Typically: commands.
    for l in new_content_tokenized[:]:
        if len(l) == 0 or (len(l) == 1 and l[0].isupper()):
            new_content_tokenized.remove(l)
        else:
            # Remove placeholders (note: any modification can be made to the text here, as the final output will be verified against the original content)
            # (this doesn’t really make any sense to me —— V.)
            for t in l[:]:
                if '%' in t:
                    l.remove(t)
# POS tag sentences
tagged_sent = pos_tagger.batch_tag(new_content_tokenized)
if __debug_on__:
Service.logger.debug("Finished main POS tagging.")
# [Issue not repro since default tagger is added] I leave it in, in any case. None tags are extremely rare, deleting these segments results in minimal loss.
tagged_sent = [r for r in tagged_sent if ("', None), ('" not in str(r)) and ("', None" not in str(r)) and ("', ''" not in str(r))]
# Define chunkers (left in 'Unk'/'UNK' as POS-tags for unknown words in the chunker definition, but the tagger uses 'NN'.)
def GetSurfaceChunksByStem(sentences):
global bad_stemmer_1
grammar = (r'''CHUNK: {(<Unk|UNK|NN.*|VBN>*)(<JJ.*|VBN>*)(<Unk|UNK|NN.*|VBN>)(<Unk|UNK|NN.*>+)}''')
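        # Illustrative match (assumed tags): "default sheet metal rule" tagged NN NN NN NN
        # satisfies this pattern and is collected as a single CHUNK below.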
cp = nltk.RegexpParser(grammar)
chunks = set()
for sent in sentences:
try:
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.node == 'CHUNK':
chunks.add(' '.join([l[0] for l in subtree.leaves()]))
except:
bad_stemmer_1 += 1
# Service.logger.debug("1+")
# Service.logger.debug("Bad stemmer 1: "+str(sent))
return chunks
# This chunker extracts units like "Elements limiting slenderness"
def GetSurfaceChunksByStem3(sentences):
global bad_stemmer_3
grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN> <VBG> <Unk|UNK|NN.*>}''')
cp = nltk.RegexpParser(grammar)
chunks = set()
for sent in sentences:
try:
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.node == 'CHUNK':
chunks.add(' '.join([l[0] for l in subtree.leaves()]))
except:
bad_stemmer_3 += 1
# Service.logger.debug("3+")
return chunks
# Chunker for single word noun-like units
def GetNouns(sentences):
global bad_stemmer_4
grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN|JJ.*>}''')
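        # Illustrative match (assumed tags): a single token tagged NN or JJ, e.g. "extrusion",
        # is emitted as a one-word CHUNK; multi-word phrases are handled by the chunkers above.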
cp = nltk.RegexpParser(grammar)
chunks = set()
for sent in sentences:
try:
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.node == 'CHUNK':
chunks.add(' '.join([l[0] for l in subtree.leaves()]))
except:
bad_stemmer_4 += 1
# Service.logger.debug("4+")
return chunks
# Get compound chunks extracted
# AND Remove duplicate chunks
new_chunks = GetSurfaceChunksByStem(tagged_sent).union(GetSurfaceChunksByStem3(tagged_sent))
if __debug_on__:
Service.logger.debug("Finished main chunking.")
Service.logger.debug((u"Skipped bad parses as follows: 1=" + str(bad_stemmer_1) + u" 3=" + str(bad_stemmer_3) + u" 4=" + str(bad_stemmer_4)).encode('utf-8'))
# Correct chunks (Some corrections aren't repro, because they were added for a different tokenizer.
# They don't hurt to have - I leave them in.)
# Maybe these characters should be removed from the beginning...?
not_needed = ['.', '^', "'", "\\", "/", "!", '_', '%', "=", '*', '>', '<', '\\', ":", "|"]
tempSet = set()
for w in new_chunks:
# [Issue not repro.] Remove '@' from multi-word units.
w = w.replace('@', '')
# [Issue not repro.] Remove '*' from the multi-word units.
w = w.replace('*', '')
# [Issue not repro.] Remove '.' from the end of multi-word units.
# [Issue not repro.] Remove ',' from the end of multi-word units.
        w = w.rstrip(".,")
# Correct issue deriving from tokenization
w = w.replace(" 's", "'s")
# Get rid of words containing '+' in chunks (for sw strings).
# Eg: 'Ctrl+A key combination' will become 'key combination'
if '+' in w:
tok = word_tok.tokenize(w)
for i in tok:
if "+" in i:
w = w.replace(i, '')
# [Issue not repro.] Remove '=' from multi-word units.
w = w.replace('=', '')
# [Issue not repro.] Remove double spaces from multi-word units.
        w = w.replace('  ', ' ')
# [Issue not repro.] Remove space from the end of multi-word units.
# [Issue not repro.] Remove space from the beginning of multi-word units.
        w = w.strip()
        # remove one-letter words from the chunk units (e.g. remains of placeholders)
        # switched to removing the whole chunk if a one-letter word was found
noWordFound = False
for word in word_tok.tokenize(w):
if word in nowords or len(word) == 1:
noWordFound = True
break
if not noWordFound:
for mark in not_needed:
if mark in w:
noWordFound = True
break
if not noWordFound:
tempSet.add(w)
new_chunks = list(tempSet)
if __debug_on__:
Service.logger.debug("Finished first chunk cleanup.")
# check if extracted multi-word units are in the ngram list.
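    # The per-language n-gram data loaded in loadAuxiliaryData() is one long space-separated string,
    # so a candidate counts as already known when " candidate " occurs in it; e.g. a chunk like
    # "sheet metal rule" (hypothetical) is kept as new only if it is absent from that string.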
compounds_new_to_the_ngram_set = set()
counter = 0
for wrd in new_chunks:
if __debug_on__:
counter += 1
if not counter % 100:
Service.logger.debug(".")
if not counter % 5000:
Service.logger.debug(str(counter))
if re.search(" " + wrd + " ", ngrams[lang]) == None:
compounds_new_to_the_ngram_set.add(wrd)
if __debug_on__:
Service.logger.debug("Finished n-gram list lookup.")
# extract noun(like) units
    nouns = [w for w in GetNouns(tagged_sent) if not w.isdigit()]
# clean results up from (untranslatable) characters, nowords content and check if they are in the original content as-is
new_nouns = set()
for n in nouns:
not_needed_found = False
for mark in not_needed:
if mark in n:
not_needed_found = True
break
if not not_needed_found:
new_nouns.add(n)
new_nouns = [w.lower() for w in new_nouns if (w.lower() not in nowords) and (w in new_content_orig_tok)]
counter = 0
# check whether nouns exist in the ngram list
nouns_new_to_tm_ngram_set = set()
for wrd in new_nouns:
if __debug_on__:
counter += 1
if not counter % 100:
Service.logger.debug(".")
if not counter % 5000:
Service.logger.debug(str(counter))
if re.search(" " + wrd + " ", ngrams[lang]) == None:
nouns_new_to_tm_ngram_set.add(wrd)
if __debug_on__:
Service.logger.debug("Finished noun selection.")
# Compounds: compounds_new_to_the_ngram_set
# Single words: nouns_new_to_tm_ngram_set
# Create one group of all chunks
# check back if extracted term candidates are in the original text as well
new_words_and_compounds = [w for w in compounds_new_to_the_ngram_set.union(nouns_new_to_tm_ngram_set) if w in new_content_orig]
if __debug_on__:
Service.logger.debug("Starting substring cleanup for " + str(len(new_words_and_compounds)) + " chunks")
    # Remove multi-word chunks that are concatenations of smaller multi-word chunks. For example, 'calculation configuration' and
    # 'dialog box' remain, but 'calculation configuration dialog box' will be removed.
tempSet = set()
new_chunks_set = set([_.lower() for _ in new_words_and_compounds])
new_chunks_temp = sorted(new_chunks_set, key=cmp_to_key(locale.strcoll))
counter = 0
for i in range(0, len(new_chunks_temp)):
for j in range(i, len(new_chunks_temp)):
if __debug_on__:
counter += 1
if not counter % 10000:
Service.logger.debug(".")
if not counter % 500000:
Service.logger.debug(str(counter))
nc = new_chunks_temp[i] + ' ' + new_chunks_temp[j]
if nc in new_chunks_set:
# Service.logger.debug("found superstring " + nc)
tempSet.add(nc)
nc = new_chunks_temp[j] + ' ' + new_chunks_temp[i]
if nc in new_chunks_set:
# Service.logger.debug("found superstring " + nc)
tempSet.add(nc)
    # Keep only the multi-word units that are not concatenations of smaller chunks.
    new_words_and_compounds = [w for w in new_chunks_temp if w not in tempSet]
if __debug_on__:
Service.logger.debug("Finished chunk substring cleanup.")
# Query NeXLT for existing translation
if __debug_on__:
Service.logger.debug("Running NeXLT queries for " + str(len(new_words_and_compounds)) + " chunks...")
def QueryNeXLT(term, language, prod_name):
r = None
if Service.isStaging:
r = requests.get("http://aws.stg.solr:8983/solr/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + prod_name + "%20AND%20" + language + ":['' TO *]")
else:
r = requests.get("http://aws.prd.solr:8983/solr/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + prod_name + "%20AND%20" + language + ":['' TO *]")
r.encoding = "utf-8"
try:
response = r.json()['response']['numFound']
except:
response = 0
return response
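    # Decoded, the query above amounts to: q=enu:"<term>" AND product:<prod_name> AND <language>:['' TO *],
    # i.e. "does any segment with this exact English text already have a translation for this product?".
    # A numFound of 0 therefore means the term is untranslated in NeXLT for that product.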
def QueryNeXLTAllProds(term, language):
r = None
if Service.isStaging:
r = requests.get("http://aws.stg.solr:8983/solr/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + '*' + "%20AND%20" + language + ":['' TO *]")
else:
r = requests.get("http://aws.prd.solr:8983/solr/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + '*' + "%20AND%20" + language + ":['' TO *]")
r.encoding = "utf-8"
try:
response = r.json()['response']['numFound']
except:
response = 0
return response
new_words_and_compounds_in_product = []
for t in new_words_and_compounds:
newTerm = True
for prod_name in prods:
newTerm = newTerm and QueryNeXLT(t.lower(), lang, prod_name) == 0
if newTerm:
new_words_and_compounds_in_product.append(t)
# product independent query to NeXLT
# append context and product/corpus information + number of occurrences
terms = []
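    # Each record appended below has the shape [term, "Corpus"|"Product", contexts, len(contexts), len(term)]:
    # "Corpus" when the term is absent from NeXLT across all products, "Product" when it merely lacks a
    # translation for the requested products.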
for term in new_words_and_compounds_in_product:
contexts = [con for con in new_content if term.lower() in con.lower()]
# Skip any terms that cannot be found in the original source. We cannot provide terms with no context to translators.
        if len(contexts) == 0:
            Service.logger.error(u"Could not find original context for term %s!" % term)
            continue
if QueryNeXLTAllProds(term.lower(), lang) == 0:
terms.append([term, "Corpus", contexts, len(contexts), len(term)])
else:
terms.append([term, "Product", contexts, len(contexts), len(term)])
if __debug_on__:
Service.logger.debug("Finished NeXLT calls with %s new terms remaining." % len(terms))
if returnJSON:
# Sort final term list and create json format
terms = sorted(terms, key=itemgetter(1,0), reverse=True)
terms_for_json = {}
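        # The JSON payload ends up keyed by term, e.g. (illustrative):
        # {"sheet metal rule": {"newto": "Product", "context": ["Edit the sheet metal rule"], "numContextSents": 1}}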
for listitem in terms:
if prods[0] == "NEW_PRODUCT":
k = {listitem[0]: {'newto': "New product, search in corpus only", 'context': listitem[2], 'numContextSents': listitem[3]}}
else:
k = {listitem[0]: {'newto': listitem[1], 'context': listitem[2], 'numContextSents': listitem[3]}}
terms_for_json.update(k)
if __debug_on__:
Service.logger.debug("Finished final processing.")
return terms_for_json
else:
if __debug_on__:
Service.logger.debug("Finished final processing.")
Service.logger.debug("Extracted terms:")
# import pprint
# for term in terms:
# Service.logger.debug("\tterm: %s" % pprint.saferepr(term))
return terms