Example #1
def process_ttriples():
    # Collect every non-comment line from the raw N-Triples file(s) into a
    # set so that exact duplicate triples are dropped up front.
    lines = set([])
    f_names = ['t_triples.nt']
    for f_name in f_names:
        i_file = open(i_dir + f_name, 'r')
        line = i_file.readline()
        while line:
            if line[0] == '#':  # skip N-Triples comment lines
                line = i_file.readline()
                continue
            lines.add(line)
            line = i_file.readline()
        i_file.close()
    
    # Sort the deduplicated lines for deterministic output, then split each
    # one into its subject/predicate/object terms and clean every term.
    o_file = open(i_dir + 't_triples_preprocessed.tab', 'w+')
    for line in sorted(lines):
        line = line[0:-1]  # strip the trailing newline
        spo = ts.get_spo(line)
        if len(spo) < 3:  # malformed triple, skip it
            continue

        # URL-decode each term, undo unicode escapes where possible, then
        # strip its namespace prefix and normalize the local name.
        s = urllib.unquote(spo[0])
        try:
            s = unicode(s, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        s = detach_sprefix(s)
        s = normalize(s)
        p = urllib.unquote(spo[1])
        try:
            p = unicode(p, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        p = detach_pprefix(p)
        p = normalize(p)
        o = urllib.unquote(spo[2])
        try:
            o = unicode(o, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        o = detach_oprefix(o)
        o = normalize(o)

        o_file.write(s + '\t' + p + '\t' + o + '\n')

        # Whenever preprocessing changed a term, remember the mapping from
        # the normalized form back to the original.
        if s != spo[0]:
            s_dictionary.add((s, spo[0]))
        if p != spo[1]:
            p_dictionary.add((p, spo[1]))
        if o != spo[2]:
            o_dictionary.add((o, spo[2]))

    o_file.close()
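These snippets rely on helpers defined elsewhere in the module and not shown on this page: i_dir, the ts module, the detach_*prefix and normalize functions, and the *_dictionary sets. Purely as an illustration, a splitter in the spirit of ts.get_spo might be sketched as below, assuming plain N-Triples input where URIs sit in angle brackets and literals in double quotes (the real implementation may differ):

import re

# Hypothetical stand-in for ts.get_spo; the real ts module is not shown here.
# It pulls the <...> and "..." terms out of one N-Triples line and returns
# them in order, so spo[0]/spo[1]/spo[2] are subject/predicate/object.
_TERM_RE = re.compile(r'<([^>]*)>|"((?:[^"\\]|\\.)*)"')

def get_spo(line):
    terms = []
    for uri, literal in _TERM_RE.findall(line):
        terms.append(uri if uri else literal)
    return terms

The preprocessing above only relies on get_spo returning at least three terms for a well-formed triple, which is exactly what the len(spo) < 3 guard checks.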
Example #2
def process_cskos():
    # Keep only the skos:broader links between categories and write them
    # out as a two-column tab-separated file.
    i_file = open(i_dir + 'c_skos.nt', 'r')
    o_file = open(i_dir + 'c_skos_preprocessed.tab', 'w+')
    
    line = i_file.readline()
    while line:
        if line[0] == '#':  # skip N-Triples comment lines
            line = i_file.readline()
            continue
        line = line[0:-1]  # strip the trailing newline
        spo = ts.get_spo(line)
        if len(spo) < 3:  # malformed triple, skip it
            line = i_file.readline()
            continue

        # Only skos:broader relations are of interest here.
        if spo[1] != 'http://www.w3.org/2004/02/skos/core#broader':
            line = i_file.readline()
            continue

        # Clean both category URIs: URL-decode, undo unicode escapes where
        # possible, strip the category prefix and normalize.
        l_c = urllib.unquote(spo[0])
        try:
            l_c = unicode(l_c, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        l_c = detach_cprefix(l_c)
        l_c = normalize(l_c)
        r_c = urllib.unquote(spo[2])
        try:
            r_c = unicode(r_c, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        r_c = detach_cprefix(r_c)
        r_c = normalize(r_c)

        o_file.write(l_c + '\t' + r_c + '\n')

        # Record normalized -> original mappings for the category dictionary.
        if l_c != spo[0]:
            c_dictionary.add((l_c, spo[0]))
        if r_c != spo[2]:
            c_dictionary.add((r_c, spo[2]))

        line = i_file.readline()

    o_file.close()
    i_file.close()
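detach_cprefix and normalize are likewise defined elsewhere. Judging by the SKOS category data being processed, they appear to strip a category namespace and canonicalize the local name; the following minimal sketch assumes a DBpedia-style category prefix, which is an assumption and not taken from this code:

# Hypothetical helpers; the real detach_cprefix/normalize are not shown.
# The prefix below is an assumed DBpedia-style category namespace.
C_PREFIX = 'http://dbpedia.org/resource/Category:'

def detach_cprefix(uri):
    # Drop the namespace so only the local category name remains.
    if uri.startswith(C_PREFIX):
        return uri[len(C_PREFIX):]
    return uri

def normalize(term):
    # Collapse the local name into a plain, space-separated token.
    return term.strip().replace('_', ' ')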
Example #3
def process_ctriples():
    # Turn each article-to-category triple into a `subject categorizedIn
    # category` row in the preprocessed output.
    i_file = open(i_dir + 'c_triples.nt', 'r')
    o_file = open(i_dir + 'c_triples_preprocessed.tab', 'w+')
    
    line = i_file.readline()
    while line:
        if line[0] == '#':  # skip N-Triples comment lines
            line = i_file.readline()
            continue
        line = line[0:-1]  # strip the trailing newline
        spo = ts.get_spo(line)
        if len(spo) < 3:  # malformed triple, skip it
            line = i_file.readline()
            continue

        # Clean the subject and the category term the same way as above.
        s = urllib.unquote(spo[0])
        try:
            s = unicode(s, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        s = detach_sprefix(s)
        s = normalize(s)
        c = urllib.unquote(spo[2])
        try:
            c = unicode(c, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        c = detach_cprefix(c)
        c = normalize(c)
        
        o_file.write(s + '\t' + 'categorizedIn' + '\t' + c + '\n')

        # Record normalized -> original mappings for later dictionary dumps.
        if s != spo[0]:
            s_dictionary.add((s, spo[0]))
        if c != spo[2]:
            c_dictionary.add((c, spo[2]))

        line = i_file.readline()

    o_file.close()
    i_file.close()
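All three functions are Python 2 code (urllib.unquote, unicode) and read module-level state instead of taking arguments. A possible driver is sketched below; the directory path and the set-typed dictionaries are placeholders, and the ts module plus the prefix/normalize helpers would still need to be provided from the rest of the original module:

import urllib  # Python 2; under Python 3 this would be urllib.parse.unquote

# Placeholder module-level state assumed by the functions above.
i_dir = './data/'        # directory holding the *.nt input files
s_dictionary = set()     # (normalized, original) subject pairs
p_dictionary = set()     # (normalized, original) predicate pairs
o_dictionary = set()     # (normalized, original) object pairs
c_dictionary = set()     # (normalized, original) category pairs

if __name__ == '__main__':
    process_ttriples()
    process_cskos()
    process_ctriples()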