コード例 #1
0
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Render a sentence as one ``token<TAB>tag`` line per token.

    The sentence is stripped and split on single spaces.  The first token
    for which ``has_ent(token, ent_mid)`` is truthy is expanded with
    ``getentparts`` and tagged ``B-E`` (first part) and ``I-E`` (remaining
    parts).  Any other token containing ``'/m/'`` is expanded the same way
    but tagged ``O``; every remaining token is emitted as-is with ``O``.

    Returns:
        A tuple ``(text, ent_inds)``.  ``text`` is the concatenation of the
        tagged lines.  ``ent_inds`` holds at most one tab-separated record:
        ``sent_counter``, the index of the matched token within the split
        sentence, the mid, ``str()`` of the entity parts, and the notable
        type.
    """
    pieces = []
    ent_inds = []
    still_looking = True
    for position, token in enumerate(sent.strip().split(' ')):
        # has_ent is evaluated for every token, including those after the
        # first match; only the first match is tagged as an entity.
        if has_ent(token, ent_mid) and still_looking:
            mid, ent_tokens, notabletype = getentparts(token)
            pieces.append(ent_tokens[0] + '\tB-E\n')
            for part in ent_tokens[1:]:
                pieces.append(part + '\tI-E\n')
            record = '\t'.join([
                str(sent_counter),
                str(position),
                mid,
                str(ent_tokens),
                notabletype,
            ])
            ent_inds.append(record)
            still_looking = False
        elif '/m/' in token:
            _mid, ent_tokens, _notabletype = getentparts(token)
            for part in ent_tokens:
                pieces.append(part + '\tO\n')
        else:
            pieces.append(token + '\tO\n')
    return (''.join(pieces), ent_inds)
コード例 #2
0
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Convert an annotated sentence into tab-separated token/tag lines.

    ``sent`` is stripped and split on single spaces.  For each token:

    * the first token for which ``has_ent(token, ent_mid)`` is truthy is
      expanded through ``getentparts``; its first part gets ``B-E`` and the
      following parts get ``I-E``, and one index record is stored;
    * any other token containing ``'/m/'`` is expanded through
      ``getentparts`` and each part gets ``O``;
    * all other tokens are written unchanged with ``O``.

    Returns:
        ``(text, ent_inds)`` where ``text`` is every ``part<TAB>tag\\n`` line
        concatenated, and ``ent_inds`` is a list with zero or one string of
        the form ``sent_counter<TAB>token index<TAB>mid<TAB>str(parts)<TAB>
        notable type``.
    """
    rows = []
    ent_inds = []
    entity_seen = False
    for idx, tok in enumerate(sent.strip().split(' ')):
        # Evaluate has_ent first for every token, as the check is not
        # skipped once an entity has already been found.
        is_target = has_ent(tok, ent_mid)
        if is_target and not entity_seen:
            mid, ent_tokens, notabletype = getentparts(tok)
            rows.append(ent_tokens[0] + '\tB-E\n')
            rows.extend(part + '\tI-E\n' for part in ent_tokens[1:])
            fields = (
                str(sent_counter),
                str(idx),
                mid,
                str(ent_tokens),
                notabletype,
            )
            ent_inds.append('\t'.join(fields))
            entity_seen = True
            continue
        if '/m/' in tok:
            _, ent_tokens, _ = getentparts(tok)
            rows.extend(part + '\tO\n' for part in ent_tokens)
            continue
        rows.append(tok + '\tO\n')
    return (''.join(rows), ent_inds)
コード例 #3
0
def getrawsent(sent):
    """Return ``sent`` with entity-annotated tokens replaced by their parts.

    The sentence is stripped and split on single spaces.  Tokens containing
    ``'/m/'`` are passed to ``getentparts`` and replaced by the returned
    entity parts; all other tokens are kept unchanged.  The pieces are
    joined with single spaces and the result is stripped.
    """
    words = []
    for token in sent.strip().split(' '):
        if '/m/' in token:
            _mid, ent_tokens, _notabletype = getentparts(token)
            words.extend(ent_tokens)
        else:
            words.append(token)
    return ' '.join(words).strip()
コード例 #4
0
def getrawsent(sent):
    """Build the plain-text form of an annotated sentence.

    After stripping, the sentence is split on single spaces.  A token that
    contains ``'/m/'`` is expanded to the entity parts returned by
    ``getentparts``; any other token stands for itself.  All resulting
    pieces are joined by single spaces and the joined string is stripped.
    """
    def _expand(token):
        # Plain tokens map to themselves; annotated ones to their parts.
        if '/m/' not in token:
            return [token]
        _, ent_tokens, _ = getentparts(token)
        return ent_tokens

    flattened = [
        part
        for token in sent.strip().split(' ')
        for part in _expand(token)
    ]
    return ' '.join(flattened).strip()
コード例 #5
0
def filter_sentences(sampled_lines):
    """Extract the plain sentence from each tab-separated line.

    For every line, the fifth tab-separated field (index 4) is split on
    whitespace.  Words containing ``'/m/'`` are replaced by the space-joined
    tokens returned by ``getentparts``; other words are kept.  The words are
    re-joined with single spaces and stripped.

    Returns:
        A list with one cleaned sentence per input line, in input order.
    """
    def _plain(word):
        # Non-annotated words pass through untouched.
        if '/m/' not in word:
            return word
        _, tokens, _ = getentparts(word)
        return ' '.join(tokens).strip()

    return [
        ' '.join(_plain(word) for word in record.split('\t')[4].split()).strip()
        for record in sampled_lines
    ]
コード例 #6
0
def filter_sentences(sampled_lines):
    """Turn sampled tab-separated lines into plain sentences.

    The sentence is taken from the fifth tab-separated field (index 4) of
    each line and split on whitespace.  A word containing ``'/m/'`` is
    replaced by the tokens from ``getentparts`` joined with spaces; every
    other word is kept as-is.  Words are joined with single spaces and the
    result is stripped.

    Returns:
        A list of cleaned sentences, one per element of ``sampled_lines``.
    """
    cleaned = []
    for record in sampled_lines:
        fields = record.split('\t')
        words = []
        for word in fields[4].split():
            if '/m/' in word:
                _, tokens, _ = getentparts(word)
                replacement = ' '.join(tokens).strip()
            else:
                replacement = word
            words.append(replacement)
        sentence = ' '.join(words)
        cleaned.append(sentence.strip())
    return cleaned
コード例 #7
0
def fillUsingLines(linespath):
    """Count how often each surface name occurs for every entity mid.

    Each line of the file at ``linespath`` is split on tabs, and the fifth
    field (index 4) is split on whitespace.  Every word containing ``'/m/'``
    is parsed with ``getentparts`` into ``(mid, tokens, notabletype)``; the
    name is the tokens joined with single spaces.

    Args:
        linespath: Path of the tab-separated lines file to read.

    Returns:
        A ``defaultdict`` mapping mid -> ``defaultdict`` mapping
        name -> number of occurrences.

    Raises:
        IndexError: If a line has fewer than five tab-separated fields.
    """
    e2name2freq = defaultdict(dict)
    # The context manager closes the file even if a malformed line or
    # getentparts raises part-way through.
    with open(linespath) as f:
        for line in f:
            parts = line.split('\t')
            for w in parts[4].split():
                if '/m/' not in w:
                    continue
                (mid, tokens, notabletype) = getentparts(w)
                name = ' '.join(tokens)
                if mid not in e2name2freq:
                    # Explicit check so each mid gets a counting dict
                    # rather than the outer default plain dict.
                    e2name2freq[mid] = defaultdict(int)
                e2name2freq[mid][name] += 1
    return e2name2freq
コード例 #8
0
ファイル: __init__.py プロジェクト: yyaghoobzadeh/figment_v2
def fillUsingLines(linespath):
    """Build a mid -> name -> frequency table from a lines file.

    The file at ``linespath`` is read line by line.  Each line is split on
    tabs and the fifth field (index 4) is split on whitespace.  Words that
    contain ``'/m/'`` are parsed with ``getentparts`` into
    ``(mid, tokens, notabletype)``, and the space-joined tokens are counted
    as one occurrence of that name for that mid.

    Args:
        linespath: Path of the tab-separated lines file to read.

    Returns:
        A ``defaultdict`` keyed by mid whose values are ``defaultdict``
        counters keyed by name.

    Raises:
        IndexError: If a line has fewer than five tab-separated fields.
    """
    e2name2freq = defaultdict(dict)
    # Use a context manager so the handle is released even when parsing a
    # line raises.
    with open(linespath) as f:
        for line in f:
            parts = line.split('\t')
            for w in parts[4].split():
                if '/m/' in w:
                    (mid, tokens, notabletype) = getentparts(w)
                    name = ' '.join(tokens)
                    if mid not in e2name2freq:
                        # Install a counting dict for a mid seen for the
                        # first time (the outer default is a plain dict).
                        e2name2freq[mid] = defaultdict(int)
                    e2name2freq[mid][name] += 1
    return e2name2freq