def load(collada, localscope, xmlnode):
    """Collect optical/skin/border surfaces from *xmlnode*.

    Registers every optical surface under localscope['surfaceproperty']
    (creating that sub-dict on first use), groups skin surfaces by their
    volumeref and border surfaces by both physvol refs, and packs
    everything into a DAEExtra.
    """
    localscope.setdefault('surfaceproperty', {})

    opticalsurface = []
    for elem in xmlnode.findall(tag("opticalsurface")):
        surf = OpticalSurface.load(collada, localscope, elem)
        localscope['surfaceproperty'][surf.name] = surf
        opticalsurface.append(surf)

    skinsurface = []
    skinmap = {}
    for elem in xmlnode.findall(tag("skinsurface")):
        skin = SkinSurface.load(collada, localscope, elem)
        skinsurface.append(skin)
        skinmap.setdefault(skin.volumeref, []).append(skin)
    log.debug("loaded %s skinsurface " % len(skinsurface))

    bordersurface = []
    bordermap = {}
    for elem in xmlnode.findall(tag("bordersurface")):
        bord = BorderSurface.load(collada, localscope, elem)
        bordersurface.append(bord)
        # a border surface is reachable from either of its two physvols
        bordermap.setdefault(bord.physvolref1, []).append(bord)
        bordermap.setdefault(bord.physvolref2, []).append(bord)
    log.debug("loaded %s bordersurface " % len(bordersurface))

    return DAEExtra(opticalsurface, skinsurface, bordersurface, skinmap, bordermap, xmlnode)
def main(param_file=None):
    """Parse a 5-line-per-entry rating corpus and dump it as JSON.

    Each corpus entry consists of: one term-pair line matching
    ``N.word:word``, two sentence lines (which are cleaned of
    punctuation and SENNA-tagged), and two numeric rating lines.
    Results are written to p['result_name'] in the run's output dir.

    param_file: optional path forwarded to tools.setup().
    """
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # raw string: '\d' in a plain literal is a deprecated escape in py3
    pair = re.compile(r'\d\.(\w+):(\w+)')
    exclude = set(string.punctuation)

    corpus_path = os.path.join(p['base_path'], p['corpora_path'], p['corpus_name'])
    line_count = 0
    res = []
    # 'with' guarantees the handles are closed (the original leaked both,
    # risking an unflushed/truncated result file)
    with codecs.open(corpus_path, mode='r', encoding='utf-8') as inp:
        for line in inp:
            # skip empty lines
            if line == "\n":
                continue
            # finished one entry: line 0 carries the term pair
            if line_count % 5 == 0:
                terms = pair.search(line).groups()  # search once, reuse
                print(terms)
                res.append({'terms': terms,
                            'sentences': [],
                            'sentences_tagged': [],
                            'values': []})
            # annotate sentence and add it to result (lines 1 and 2)
            if line_count % 5 == 1 or line_count % 5 == 2:
                res[-1]['sentences'].append(line.strip())
                cleaned = "".join(ch for ch in line.strip() if ch not in exclude)
                tagged = tools.tag(cleaned, p['senna_path'])
                res[-1]['sentences_tagged'].append(tagged)
            # add the ratings (lines 3 and 4)
            if line_count % 5 == 3 or line_count % 5 == 4:
                res[-1]['values'].append(float(line))
            line_count = line_count + 1

    # store the output
    with codecs.open(os.path.join(output_dir, p['result_name']),
                     mode='w', encoding='utf-8') as out:
        json.dump(res, out, indent=2)
def create_ex(text, pos='n', last_index=False, fast=False):
    """Create cloze (gap-fill) exercises from *text*.

    text: the input text to build exercises from.
    pos: which part of speech to blank out -- 'n' (nouns), 'v' (verbs) or
        'a' (adjectives); anything else prints an error and returns None.
    last_index: if truthy, choose the candidate token occurring furthest
        to the right in the sentence (per the literature this yields
        better multiple-choice questions); otherwise sample a candidate
        weighted by its lemma frequency in the text.
    fast: forwarded to tools.get_dis -- use only lemmas from the text as
        distractors, which avoids WordNet lookups and is much faster.

    Returns tools.sanitize_sents() applied to a per-sentence list:
    sentences without a usable cloze keep their plain text, the others
    become [wordsbefore, wordsafter, token, distractors].
    """
    # sentence and word tokenizing -> list of lists of tokens
    sents = tools.tokenize(text)
    # POS tagging
    tagged = tools.tag(sents)
    # words entries: [lemma, count, [[token (original), POS tag, lemma,
    #   sentence index in text, token index in sentence], ...]]
    if pos == "n":
        words = tools.get_nouns(tagged)
    elif pos == "v":
        words = tools.get_verbs(tagged)
    elif pos == 'a':
        words = tools.get_adj(tagged)
    else:
        print("Fehler: Unbekannter POS Tag!")
        return
    lemmas_in_order_of_frequency = [k[0] for k in words]
    print len(lemmas_in_order_of_frequency)
    # no of sentences available
    sent_count = len(tagged)
    sents_with_cloze = [[] for x in xrange(sent_count)]
    # regex using look-ahead for whitespace before certain punctuation marks
    r = re.compile(r'\s(?=,|\.|!|;|"|\'|\))')
    for i in range(sent_count):
        # prepare the sentence
        s = sents[i]
        # which lemmas occur in this sentence?
        lemmas_in_s = []
        for n in words:
            for k in n[2]:
                if k[3]==i:
                    lemmas_in_s.append([n[1]] + k)
        # e.g. [[10, u'prince', 'NN', u'prince', 1, 3],
        #       [4, u'flowers', 'NNS', u'flower', 1, 7], ...]
        # when no cloze question can be built from this sentence:
        if lemmas_in_s == []:
            # remove the whitespace before , . ! etc. introduced by " ".join()
            s = re.sub(r, "", " ".join(s))
            sents_with_cloze[i].append(s)
            continue
        elif len(lemmas_in_s) == 1:
            chosen = lemmas_in_s[0]
        else:
            # faster alternative: token as far right in the sentence as
            # possible, since per the literature this makes a better
            # multiple-choice question
            if last_index:
                # NOTE(review): this rebinds the *last_index* parameter to an
                # int; harmless today (any max index is truthy, so later
                # iterations still take this branch), but fragile -- confirm
                # before relying on it.
                last_index = max([k[5] for k in lemmas_in_s])
                chosen_l = [k for k in lemmas_in_s if k[5] == last_index]
                chosen = chosen_l[0]
            else:
                # prefer frequent lemmas so they are learned better;
                # maps index in lemmas_in_s -> frequency of lemma in text
                indexes_and_counts = dict((k, lemmas_in_s[k][0]) for k in range(len(lemmas_in_s)))
                chosen_index = tools.simple_prob_dist(indexes_and_counts)
                chosen = lemmas_in_s[chosen_index]
        token = chosen[1]
        token_index = chosen[5]
        # MUCH faster variant (parameter fast=True): only use lemmas from the
        # text, avoiding WordNet accesses
        dis = tools.get_dis(chosen, pos, lemmas_in_order_of_frequency, fast=fast)
        # when no valid distractors can be found, skip this sentence
        if dis == None:
            s = re.sub(r, "", " ".join(s))
            sents_with_cloze[i].append(s)
            continue
        # adapt distractors grammatically
        dis = tools.adapt_dis(chosen, pos, dis)
        # normalize a / an so no unintended hints are given
        if pos == 'n':
            if s[token_index -1] == 'a' or s[token_index -1] == 'an':
                s[token_index -1] = ""
                token = en.noun.article(token)
                dis = [en.noun.article(d) for d in dis]
        wordsbefore = " ".join(s[:token_index])
        if token_index < len(s) - 1:
            wordsafter = " ".join(s[(token_index+1):])
        else:
            wordsafter = ""
        # remove the whitespace before , . ! etc. introduced by " ".join()
        wordsbefore = re.sub(r, "", wordsbefore)
        wordsafter = re.sub(r, "", wordsafter)
        cloze = [wordsbefore, wordsafter, token, dis]
        sents_with_cloze[i] = cloze
    # sents_with_cloze: plain strings for skipped sentences, otherwise
    # [wordsbefore, wordsafter (unicode), token (unicode), [distractors]]
    sents_with_cloze = tools.sanitize_sents(sents_with_cloze)
    return sents_with_cloze
def test_sentence_with_loc():
    """tag a sentence that contains a location"""
    tagged = tools.tag('To drink alcohol is very good for you in Berlin', senna_path)
    second_token = tagged[1]
    # 'drink' should be lemmatized
    assert 'base' in second_token
    assert second_token['base'] == 'drink'
    # 'Berlin' is the final token and must be recognized as a location
    assert tagged[-1]['ner'] == "S-LOC"
def test_short_sentence():
    """tag a simple and short sentence"""
    tagged = tools.tag('Alcohol is very good for you', senna_path)
    first_token = tagged[0]
    assert first_token['term'] == 'Alcohol'
    # already a base form, so no 'base' entry is added
    assert 'base' not in first_token
    # every token annotation carries exactly three fields
    assert [len(entry) for entry in tagged] == [3, 3, 3, 3, 3, 3]