Code Example #1
File: daeextra.py  Project: NuTufts/ChromaUBooNE
    def load(collada, localscope, xmlnode):
        """Parse opticalsurface, skinsurface and bordersurface elements from
        xmlnode, indexing optical surfaces by name and skin/border surfaces
        by their volume references."""
        if 'surfaceproperty' not in localscope:
            localscope['surfaceproperty'] = {}

        opticalsurface = []
        for elem in xmlnode.findall(tag("opticalsurface")):
            surf = OpticalSurface.load(collada, localscope, elem)
            localscope['surfaceproperty'][surf.name] = surf
            opticalsurface.append(surf)
        #log.debug("loaded %s opticalsurface " % len(opticalsurface))

        skinmap = {}
        skinsurface = []
        for elem in xmlnode.findall(tag("skinsurface")):
            skin = SkinSurface.load(collada, localscope, elem)
            skinsurface.append(skin)

            if skin.volumeref not in skinmap:
                skinmap[skin.volumeref] = []
            skinmap[skin.volumeref].append(skin)

        log.debug("loaded %s skinsurface " % len(skinsurface))

        bordermap = {}
        bordersurface = []
        for elem in xmlnode.findall(tag("bordersurface")):
            bord = BorderSurface.load(collada, localscope, elem)
            bordersurface.append(bord)

            if bord.physvolref1 not in bordermap:
                bordermap[bord.physvolref1] = []
            bordermap[bord.physvolref1].append(bord)

            if bord.physvolref2 not in bordermap:
                bordermap[bord.physvolref2] = []
            bordermap[bord.physvolref2].append(bord)

        log.debug("loaded %s bordersurface " % len(bordersurface))

        return DAEExtra(opticalsurface, skinsurface, bordersurface, skinmap,
                        bordermap, xmlnode)
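
The three loops above share the same indexing idiom: create an empty list the first time a key is seen, then append. As a side note, Python's dict.setdefault expresses the same thing in one line per append; a minimal equivalent sketch using the variables from the loops above (behaviour is identical, this is not a change to the project code):

    # equivalent to the "if key not in map: map[key] = []" pattern above
    skinmap.setdefault(skin.volumeref, []).append(skin)
    bordermap.setdefault(bord.physvolref1, []).append(bord)
    bordermap.setdefault(bord.physvolref2, []).append(bord)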
Code Example #2
File: oshea2json_run.py  Project: quesada/runs-gensim
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    inp = codecs.open(os.path.join(p['base_path'],
                                   p['corpora_path'],
                                   p['corpus_name']),
                      mode='r', encoding='utf-8')
    out = codecs.open(os.path.join(output_dir,
                                   p['result_name']),
                      mode='w', encoding='utf-8')
    pair = re.compile(r'\d\.(\w+):(\w+)')
    exclude = set(string.punctuation)

    line_count = 0
    res = []

    for line in inp:
        
        # skip empty lines
        if line == "\n":
            continue
        
        # finished one entry
        if line_count % 5 == 0:
            terms = pair.search(line).groups()
            print terms
            res.append({'terms': terms,
                        'sentences': [],
                        'sentences_tagged': [],
                        'values': []})

        # annotate sentence and add it to result
        if line_count % 5 == 1 or line_count % 5 == 2:
            res[-1]['sentences'].append(line.strip())
            cleaned = "".join(ch for ch in line.strip() if ch not in exclude)
            tagged = tools.tag(cleaned, p['senna_path'])
            res[-1]['sentences_tagged'].append(tagged)

        # add the ratings
        if line_count % 5 == 3 or line_count % 5 == 4:
            res[-1]['values'].append(float(line))

        line_count += 1
    
    # store the output
    json.dump(res, out, indent=2)
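
The `line_count % 5` branches above imply that the corpus file is organised in five-line records: a numbered term pair, two sentences, and two ratings. A hypothetical record matching that assumed layout (the concrete words and ratings are invented for illustration; only the structure follows from the parser):

import re

pair = re.compile(r'\d\.(\w+):(\w+)')
sample_record = [
    "1.car:automobile",                   # line_count % 5 == 0: term pair
    "The car drove down the street.",     # line_count % 5 == 1: first sentence
    "An automobile was parked outside.",  # line_count % 5 == 2: second sentence
    "3.92",                               # line_count % 5 == 3: first rating
    "4.0",                                # line_count % 5 == 4: second rating
]
assert pair.search(sample_record[0]).groups() == ('car', 'automobile')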
Code Example #3
File: wordex.py  Project: SuzanaK/wordgap
def create_ex(text, pos='n', last_index=False, fast=False):

    # list of lists of tokens
    # sentence and word tokenizing
    sents = tools.tokenize(text)
    # POS tagging 
    tagged = tools.tag(sents)

    # [lemma, count, [[token (original), POS tag, lemma, index of the sentence in the text, index of the token in the sentence], ...] ]
    if pos == "n":
        words = tools.get_nouns(tagged)
    elif pos == "v":
        words = tools.get_verbs(tagged)
    elif pos == 'a':
        words = tools.get_adj(tagged)
    else:
        print("Fehler: Unbekannter POS Tag!")
        return
    lemmas_in_order_of_frequency = [k[0] for k in words]
    print len(lemmas_in_order_of_frequency)
    # no of sentences available
    sent_count = len(tagged)
    sents_with_cloze = [[] for x in xrange(sent_count)]
    # regex that uses a look-ahead to find whitespace before certain punctuation marks
    r = re.compile(r'\s(?=,|\.|!|;|"|\'|\))')

    for i in range(sent_count):
        # prepare the sentence
        s = sents[i]
        
        # which lemmas occur in this sentence?
        lemmas_in_s = []
        for n in words:
            for k in n[2]:
                if k[3]==i:
                    lemmas_in_s.append([n[1]] + k)
        
        # [[10, u'prince', 'NN', u'prince', 1, 3], [4, u'flowers', 'NNS', u'flower', 1, 7], ...]
        # if no cloze question can be built from this sentence:
        if not lemmas_in_s:
            # remove the whitespace before , . ! etc. that was introduced by " ".join()
            s = re.sub(r, "", " ".join(s))
            sents_with_cloze[i].append(s)
            continue 

        elif len(lemmas_in_s) == 1:
            chosen = lemmas_in_s[0]
            
        else:
            # faster alternative: pick the token closest to the end of the sentence, since according to the literature this makes a better multiple-choice question
            if last_index:
                #print lemmas_in_s 
                last_index = max([k[5] for k in lemmas_in_s])
                chosen_l = [k for k in lemmas_in_s if k[5] == last_index]
                chosen = chosen_l[0]
                #print chosen 
            else:
            # prefer frequent lemmas so that they are learned better
            # index in lemmas_in_s and frequency of lemma in text 
                indexes_and_counts = dict((k, lemmas_in_s[k][0]) for k in range(len(lemmas_in_s)))
                chosen_index = tools.simple_prob_dist(indexes_and_counts)
                chosen = lemmas_in_s[chosen_index]
                
        token = chosen[1]    
        token_index = chosen[5]

        # MUCH faster variant (parameter fast=True): use only lemmas from the text, avoiding WordNet lookups
        dis = tools.get_dis(chosen, pos, lemmas_in_order_of_frequency, fast=fast)

        # if no valid distractors can be found, skip this sentence
        if dis is None:
            s = re.sub(r, "", " ".join(s))
            sents_with_cloze[i].append(s)
            continue
        # adapt the distractors grammatically
        dis = tools.adapt_dis(chosen, pos, dis)


        # adjust a / an so as not to give unnecessary hints
        if pos == 'n':
            if s[token_index -1] == 'a' or s[token_index -1] == 'an':
                s[token_index -1] = ""
                token = en.noun.article(token)
                dis = [en.noun.article(d) for d in dis]

        wordsbefore = " ".join(s[:token_index])
        if token_index < len(s) - 1:
            wordsafter = " ".join(s[(token_index+1):])
        else:
            wordsafter = ""
        # remove the whitespace before , . ! etc. that was introduced by " ".join()
        wordsbefore = re.sub(r, "", wordsbefore)
        wordsafter = re.sub(r, "", wordsafter)
        cloze = [wordsbefore, wordsafter, token, dis]
        sents_with_cloze[i] = cloze



    # [[wordsbefore, wordsafter (unicode strings), token (unicode string), [distractors as unicode strings]], ...]
    
    sents_with_cloze = tools.sanitize_sents(sents_with_cloze)

    return sents_with_cloze
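
A minimal usage sketch for `create_ex` (assumptions: the wordgap package and its dependencies, e.g. the tagger behind `tools.tag` and the Nodebox `en` module, are installed; the sample text is invented):

from wordex import create_ex

text = u"The little prince looked at the flowers. He watered one flower every day."
# Build noun cloze items; fast=True restricts distractor candidates to lemmas
# from the text itself instead of querying WordNet.
clozes = create_ex(text, pos='n', fast=True)
# Each cloze item has the form [wordsbefore, wordsafter, token, [distractors]];
# sentences for which no gap could be built are passed through as plain text.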
Code Example #4
File: test_tagger.py  Project: dedan/runs-gensim
def test_sentence_with_loc():
    """tag a sentence that contains a location"""
    tagged = tools.tag('To drink alcohol is very good for you in Berlin', senna_path)
    assert 'base' in tagged[1]
    assert tagged[1]['base'] == 'drink'
    assert tagged[-1]['ner'] == "S-LOC"
Code Example #5
File: test_tagger.py  Project: dedan/runs-gensim
def test_short_sentence():
    """tag a simple and short sentence"""
    tagged = tools.tag('Alcohol is very good for you', senna_path)
    assert tagged[0]['term'] == 'Alcohol'
    assert 'base' not in tagged[0]
    assert [len(tag) for tag in tagged] == [3, 3, 3, 3, 3, 3]
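
Taken together, the two tests pin down part of the `tools.tag` output format: one dict per token, a 'term' key holding the original token, a 'base' key only for inflected forms, an 'ner' key with the named-entity label, and exactly three keys per token for this sentence. A hypothetical return value consistent with those assertions (the third key name 'pos' and the concrete tag values are assumptions, not taken from the tests):

# Hypothetical tools.tag output for 'Alcohol is very good for you':
expected_shape = [
    {'term': 'Alcohol', 'pos': 'NN',  'ner': 'O'},
    {'term': 'is',      'pos': 'VBZ', 'ner': 'O'},
    {'term': 'very',    'pos': 'RB',  'ner': 'O'},
    {'term': 'good',    'pos': 'JJ',  'ner': 'O'},
    {'term': 'for',     'pos': 'IN',  'ner': 'O'},
    {'term': 'you',     'pos': 'PRP', 'ner': 'O'},
]
assert expected_shape[0]['term'] == 'Alcohol'
assert all(len(tag) == 3 for tag in expected_shape)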