Code Example #1
def generate_ngrams(line):
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> do not attempt to link, could be wrong (Apple)
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:
                    # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]
                    # get types
                    tokens_plain.append('<dbpedia:' + uri + '>')
                    i = j - 1
                    break
            i += 1
        for n in range(1, N + 1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
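
The helpers line_filter and default_tokenize_func, the unambiguous_labels dictionary and the n-gram order N are module-level names defined elsewhere in the project this listing comes from. Below is a minimal, self-contained sketch of the same greedy longest-match idea, with a made-up toy_labels dictionary and a plain counting loop in place of nltk.ngrams; every name in the sketch is illustrative, not the project's.

def toy_generate_ngrams(sentence, toy_labels, n_max=2, max_span=20):
    # Greedy longest-match linking: at each position try the longest span
    # first (up to max_span tokens) and replace a dictionary hit with its
    # linked URI token, then skip past the matched span.
    words = sentence.split()
    tokens_plain = []
    i = 0
    while i < len(words):
        for j in range(min(len(words), i + max_span), i, -1):
            token = ' '.join(words[i:j])
            if i + 1 == j and i == 0:
                # the first single word of a sentence is never linked ("Apple")
                tokens_plain.append(token.lower())
            elif token in toy_labels:
                tokens_plain.append('<dbpedia:' + toy_labels[token] + '>')
                i = j - 1
                break
        i += 1
    # plain n-gram counting in place of nltk.ngrams
    result = []
    for n in range(1, n_max + 1):
        for k in range(len(tokens_plain) - n + 1):
            result.append((' '.join(tokens_plain[k:k + n]), 1))
    return result

toy_labels = {'new york': 'New_York_City', 'barack obama': 'Barack_Obama'}
print(toy_generate_ngrams('yesterday barack obama visited new york', toy_labels))

Because spans are tried longest-first, a multi-word label is always preferred over its individual words, which is what lets "barack obama" become a single linked token here.
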
Code Example #2
def unpack_achors(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # all (uri, count) pairs recorded for this label
    uri_counts = ListPacker.unpack(uri_list)
    total_count = sum(int(c) for _, c in uri_counts)
    print(label + '\t' + str(len(uri_counts)) + '\t' + str(total_count))
Code Example #3
def link(sentence):
    tokens = default_tokenize_func(sentence)
    pos_tokens = nltk.pos_tag(tokens)
    candidates = extract_candidates(pos_tokens)
    if len(candidates) > 0:
        graph = SemanticGraph(candidates)
        graph.do_iterative_removal()
        graph.do_linking()
    return candidates
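
SemanticGraph, extract_candidates, do_iterative_removal and do_linking are project internals that are not shown in this listing. As a rough, hedged illustration only (this is an assumption, not the project's extract_candidates), a candidate extractor over nltk.pos_tag output could group maximal runs of proper nouns into mentions:

def toy_extract_candidates(pos_tokens):
    # Group maximal runs of proper nouns (NNP/NNPS) into mention candidates.
    candidates = []
    current = []
    for word, tag in pos_tokens:
        if tag in ('NNP', 'NNPS'):
            current.append(word)
        else:
            if current:
                candidates.append(' '.join(current))
            current = []
    if current:
        candidates.append(' '.join(current))
    return candidates

tagged = [('Barack', 'NNP'), ('Obama', 'NNP'), ('visited', 'VBD'),
          ('Berlin', 'NNP'), ('.', '.')]
print(toy_extract_candidates(tagged))  # ['Barack Obama', 'Berlin']
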
Code Example #4
def unpack_achors(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    if len(uri_counts) > 1:
        return
    uri, count = uri_counts[0]
    print(label + '\t' + uri + '\t' + count)
Code Example #5
def filter_labels(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # all (uri, count) pairs recorded for this label
    uri_counts = [(uri, int(count))
                  for uri, count in ListPacker.unpack(uri_list)]
    total = sum(count for _, count in uri_counts)

    for uri, count in uri_counts:
        if count / total > args.percentile and count > args.min_count:
            print(label + '\t' + uri + '\t' + str(count))
            break
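
filter_labels depends on the project's ListPacker format and on the command-line thresholds args.percentile and args.min_count. The sketch below keeps only the dominance test, with the thresholds passed in explicitly; the function name and the threshold values are illustrative, not the project's.

def dominant_uri(uri_counts, percentile=0.9, min_count=5):
    # uri_counts: list of (uri, count) pairs for one label, counts as ints.
    # Return the dominant URI if it gets a large enough share of the anchor
    # occurrences and a minimum absolute count, otherwise None.
    total = sum(count for _, count in uri_counts)
    for uri, count in uri_counts:
        if count / total > percentile and count > min_count:
            return uri
    return None

print(dominant_uri([('Paris', 29), ('Paris,_Texas', 1)]))   # Paris
print(dominant_uri([('Apple_Inc.', 12), ('Apple', 10)]))     # None
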
Code Example #6
def generate_ngrams(line):
    labels = []
    line = line.strip()
    for sentence in line_filter(' '.join(
            tokenize_possessive(default_tokenize_func(line)))):
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> skip, could be wrong (Apple)
                    continue
                elif token in organic_label_dict:
                    labels.append(((token, organic_label_dict[token]), 1))
                    i = j - 1
                    break
            i += 1
    return labels