Esempi in Python per clean, esempi in Python per preprocessing.cleansing.clean

Esempio n. 1

0

Mostra file

def learn_triplets_cooccur_mat(triplets_file_path):
    """Count NP1 x (VP + NP2) word co-occurrences over a set of triplet files.

    ``triplets_file_path`` is a glob pattern. Every line that does not start
    with '<' is lower-cased and split on '|'; field 0 supplies the row words,
    fields 1 and 2 together supply the column words. Only words contained in
    the corresponding vocabulary are counted.

    Returns a matrix of shape (row vocabulary size, column vocabulary size)
    whose entry [i, j] is the number of times row word i and column word j
    appeared on the same line.
    """
    matched_paths = glob.glob(triplets_file_path)
    row_voc = vocabulary.Vocabulary()
    col_voc = vocabulary.Vocabulary()
    row_voc.load('../mat/np1.voc')
    # NOTE(review): the column vocabulary is read from np2.voc although it
    # is applied to VP and NP2 words alike -- confirm this is the intended file.
    col_voc.load('../mat/np2.voc')
    row_count = row_voc.size()
    col_count = col_voc.size()
    counts = zeros([row_count, col_count])

    for path in matched_paths:
        with open(path, 'r') as handle:
            for raw in handle:
                # Lines beginning with '<' are skipped.
                if raw[0] == '<':
                    continue
                # Drops the last two characters of the raw line (presumably
                # the line terminator -- TODO confirm the file format).
                fields = raw[:-2].lower().split('|')
                subject_words = cleansing.clean(fields[0].split())
                column_words = cleansing.clean(fields[1].split())
                object_words = cleansing.clean(fields[2].split())
                column_words.extend(object_words)

                kept_rows = [w for w in subject_words if row_voc.contain(w)]
                kept_cols = [w for w in column_words if col_voc.contain(w)]
                for u in kept_rows:
                    for v in kept_cols:
                        counts[row_voc.get_word_index(u),
                               col_voc.get_word_index(v)] += 1
    return counts

Esempio n. 2

0

Mostra file

File: triplets_mat.py Progetto: RedHenLab/CDI

def learn_triplets_cooccur_mat(triplets_file_path):
    """Build the NP1 versus VP/NP2 co-occurrence count matrix.

    All files matching the glob pattern ``triplets_file_path`` are read.
    Lines starting with '<' are ignored. Every other line is lower-cased and
    split on '|' into three fields; cleaned words of field 0 index the rows,
    cleaned words of fields 1 and 2 index the columns. Words missing from
    the loaded vocabularies are dropped.

    Returns the count matrix, one row per word of the first vocabulary and
    one column per word of the second.
    """
    triplet_paths = glob.glob(triplets_file_path)
    subject_voc = vocabulary.Vocabulary()
    predicate_voc = vocabulary.Vocabulary()
    subject_voc.load('../mat/np1.voc')
    # NOTE(review): VP and NP2 words are both looked up in np2.voc -- verify
    # that this is the vocabulary file meant for the column side.
    predicate_voc.load('../mat/np2.voc')
    n_rows = subject_voc.size()
    n_cols = predicate_voc.size()
    matrix = zeros([n_rows, n_cols])

    for triplet_path in triplet_paths:
        with open(triplet_path, 'r') as source:
            for text in source:
                if text[0] == '<':
                    continue
                # The final two characters of each line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                text = text[:-2].lower()
                parts = text.split('|')
                first = cleansing.clean(parts[0].split())
                middle = cleansing.clean(parts[1].split())
                last = cleansing.clean(parts[2].split())
                # NP2 words are treated as additional column words.
                middle.extend(last)

                row_words = [w for w in first if subject_voc.contain(w)]
                col_words = [w for w in middle if predicate_voc.contain(w)]
                for row_word in row_words:
                    for col_word in col_words:
                        i = subject_voc.get_word_index(row_word)
                        j = predicate_voc.get_word_index(col_word)
                        matrix[i, j] += 1
    return matrix

Esempio n. 3

0

Mostra file

File: storyclustering.py Progetto: roshniRam/CDI

def learn_story_histogram(file_in, words_voc, word_type='ALL', ocr_file=None):
    """Build a normalized word histogram for one story.

    Each line of ``file_in`` that does not start with '<' is lower-cased and
    split on '|' into three fields (NP1 | VP | NP2). The fields selected by
    ``word_type`` are cleaned with ``cleansing.clean`` and every resulting
    word contained in ``words_voc`` adds 1 to its histogram bin. If
    ``ocr_file`` is given, its lines are cleaned the same way (without the
    '|' split) and counted as well.

    Args:
        file_in: path of the triplet file.
        words_voc: vocabulary object providing ``size()``, ``contain(word)``
            and ``get_word_index(word)``.
        word_type: one of 'NP1', 'VP', 'NP2', 'ALL'.
        ocr_file: optional path of an OCR text file.

    Returns:
        A tuple ``(hist, document, ocr_words)``: ``hist`` is a
        1 x ``words_voc.size()`` array scaled to sum to 1 (left as zeros when
        nothing was counted); ``document`` lists every counted word in
        reading order; ``ocr_words`` lists all cleaned OCR words, including
        those not in the vocabulary.

    Raises:
        AssertionError: if the normalized histogram does not sum to ~1.
    """
    words_voc_num = words_voc.size()
    hist = np.zeros([1, words_voc_num])
    document = []

    with open(file_in, 'r') as f:
        for line in f:
            if line[0] == '<':
                continue
            # Drops the last two characters of the raw line (presumably the
            # line terminator -- TODO confirm the file format).
            line = (line[:-2]).lower()
            words = []
            triplets = line.split('|')
            if word_type in ('NP1', 'ALL'):
                words.extend(cleansing.clean(triplets[0].split()))
            if word_type in ('VP', 'ALL'):
                words.extend(cleansing.clean(triplets[1].split()))
            if word_type in ('NP2', 'ALL'):
                words.extend(cleansing.clean(triplets[2].split()))

            words_new = [w for w in words if words_voc.contain(w)]
            document.extend(words_new)
            for w in words_new:
                hist[0, words_voc.get_word_index(w)] += 1

    # Read OCR file and combine into histogram if provided.
    ocr_words = []
    if ocr_file is not None:
        # TODO: the original note asks for each OCR word to count 2; each
        # occurrence is currently counted once.
        with open(ocr_file, 'r') as f:
            for line in f:
                if line[0] == '<':
                    continue
                line = (line[:-2]).lower()
                line_words = list(cleansing.clean(line.split()))
                ocr_words.extend(line_words)
                # Count only the words of the current line. Filtering the
                # accumulated ocr_words list here would add the words of
                # every earlier line again for each subsequent line.
                line_words_new = [w for w in line_words
                                  if words_voc.contain(w)]
                document.extend(line_words_new)
                for w in line_words_new:
                    hist[0, words_voc.get_word_index(w)] += 1

    # Normalize.
    sum_hist = hist.sum()
    if sum_hist != 0:
        hist = hist / sum_hist
        try:
            assert(hist.sum() > 0.9 and hist.sum() < 1.1)
        except AssertionError:
            # Dump the offending histogram before propagating the failure.
            print(hist)
            print(hist.sum())
            raise
    return (hist, document, ocr_words)

Esempio n. 4

0

Mostra file

File: storyclustering.py Progetto: hang-qi/CDI

def learn_story_histogram(file_in, words_voc, word_type="ALL", ocr_file=None):
    """Compute the normalized vocabulary histogram of a story file.

    Lines of ``file_in`` starting with "<" are skipped. Every other line is
    lower-cased and split on "|" into NP1 | VP | NP2 fields; the fields
    selected by ``word_type`` ('NP1', 'VP', 'NP2' or 'ALL') are passed
    through ``cleansing.clean``. Each resulting word that ``words_voc``
    contains increments its histogram bin by one. When ``ocr_file`` is not
    None its lines are cleaned as a whole and counted the same way.

    Args:
        file_in: path of the triplet file.
        words_voc: object exposing ``size()``, ``contain(word)`` and
            ``get_word_index(word)``.
        word_type: which triplet fields contribute words.
        ocr_file: optional path of an OCR text file.

    Returns:
        ``(hist, document, ocr_words)`` where ``hist`` is a
        1 x ``words_voc.size()`` array divided by its sum (unchanged if the
        sum is 0), ``document`` is the list of counted words in reading
        order, and ``ocr_words`` is the list of all cleaned OCR words,
        in-vocabulary or not.

    Raises:
        AssertionError: if the normalized histogram does not sum to ~1.
    """
    hist = np.zeros([1, words_voc.size()])
    document = []
    # Field position in the "|"-separated line for each selectable type.
    field_tags = ("NP1", "VP", "NP2")

    with open(file_in, "r") as f:
        for line in f:
            if line[0] == "<":
                continue
            # The final two characters of each raw line are discarded
            # (presumably a two-character terminator -- TODO confirm).
            triplets = (line[:-2]).lower().split("|")
            words = []
            for position, tag in enumerate(field_tags):
                if word_type == tag or word_type == "ALL":
                    words.extend(cleansing.clean(triplets[position].split()))

            words_new = [w for w in words if words_voc.contain(w)]
            document.extend(words_new)
            for w in words_new:
                hist[0, words_voc.get_word_index(w)] += 1

    # Read OCR file and combine into histogram if provided.
    ocr_words = []
    if ocr_file is not None:
        # TODO: the original note asks for each OCR word to count 2; each
        # occurrence is currently counted once.
        with open(ocr_file, "r") as f:
            for line in f:
                if line[0] == "<":
                    continue
                current = list(cleansing.clean((line[:-2]).lower().split()))
                ocr_words.extend(current)
                # Only the current line is filtered and counted; using the
                # cumulative ocr_words list here would re-count the words of
                # all previous lines on every new line.
                current_in_voc = [w for w in current if words_voc.contain(w)]
                document.extend(current_in_voc)
                for w in current_in_voc:
                    hist[0, words_voc.get_word_index(w)] += 1

    # Normalize.
    sum_hist = hist.sum()
    if sum_hist != 0:
        hist = hist / sum_hist
        try:
            assert hist.sum() > 0.9 and hist.sum() < 1.1
        except AssertionError:
            # Show the offending histogram before re-raising.
            print(hist)
            print(hist.sum())
            raise
    return (hist, document, ocr_words)

Esempio n. 5

0

Mostra file

def learn_triplets_cooccur_mat(file_in, co_mat_file):
    """Extract the sub-matrix of a stored matrix for the NP1 words of a file.

    A ``CooccurMatrix`` is loaded from ``co_mat_file``. The first
    '|'-separated field of every line of ``file_in`` not starting with '<'
    is cleaned, and each word contained in the loaded vocabulary is added to
    a new vocabulary. The result pairs that new vocabulary with the entries
    of the loaded matrix restricted to those words.

    Args:
        file_in: path of the triplet file.
        co_mat_file: path passed to ``CooccurMatrix.load``.

    Returns:
        ``CooccurMatrix(sub_matrix, np1_all)`` where ``sub_matrix[i, j]`` is
        the loaded matrix entry for words i and j of ``np1_all``.
    """
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] == '<':
                continue
            # Drops the last two characters of the raw line (presumably the
            # line terminator -- TODO confirm the file format).
            line = (line[:-2]).lower()
            triplets = line.split('|')
            np1 = cleansing.clean(triplets[0].split())

            # Delete words not in the similarity vocabulary.
            np1_new = [w for w in np1 if np_voc.contain(w)]
            for w in np1_new:
                np1_all.add(w)

    num_np1 = np1_all.size()
    # Resolve each local word to its index in the loaded vocabulary once,
    # instead of repeating both lookups for every (i, j) cell.
    source_index = [np_voc.get_word_index(np1_all.get_word(i))
                    for i in range(num_np1)]
    similarity_mat_np1 = zeros([num_np1, num_np1])
    for i, row in enumerate(source_index):
        for j, col in enumerate(source_index):
            similarity_mat_np1[i, j] = np1_matrix[row, col]

    return CooccurMatrix(similarity_mat_np1, np1_all)

Esempio n. 6

0

Mostra file

File: ocr_insert.py Progetto: roshniRam/CDI

def ocr_find_story(ocr_results, ocr_time, stories):
    """Return the index of the story an OCR snippet belongs to, or -1.

    ``ocr_time`` must start with ten digits followed by a two-digit minute
    and a two-digit second (a non-matching value raises AttributeError).
    A window of +/- 3 minutes around that minute, clamped to [0, 59], is
    passed to ``time_overlap`` together with the second value and each story.

    The OCR words are cleaned and words of length <= 2 are dropped; if none
    remain the result is -1. The first story that passes ``time_overlap``
    and has a positive ``words_overlap_percentage`` decides the outcome: its
    index is returned when that value is at least 0.5, otherwise -1.
    """
    timestamp_re = re.compile(r"""\d\d\d\d\d\d\d\d\d\d(\d\d)(\d\d).+""", re.I | re.X)
    minute_text, second_text = timestamp_re.match(ocr_time).groups()
    minute = int(minute_text)
    second = int(second_text)
    window_start = max([minute - 3, 0])
    window_end = min([minute + 3, 59])

    candidates = [w for w in cleansing.clean(ocr_results) if len(w) > 2]
    if not candidates:
        return -1

    for index, story in enumerate(stories):
        if not time_overlap(window_start, window_end, second, story):
            continue
        score = words_overlap_percentage(candidates, story)
        if score > 0:
            # NOTE(review): the search stops at the first positive overlap
            # rather than looking for the best one -- confirm this is wanted.
            return index if score >= 0.5 else -1
    return -1

Esempio n. 7

0

Mostra file

File: cluster_triplets.py Progetto: RedHenLab/CDI

def learn_triplets_cooccur_mat(file_in, co_mat_file):
    """Restrict a stored matrix to the NP1 words that occur in ``file_in``.

    Loads a ``CooccurMatrix`` from ``co_mat_file``, collects the cleaned
    words of the first '|'-separated field of every line of ``file_in`` not
    starting with '<' (keeping only words the loaded vocabulary contains),
    and copies the matching entries of the loaded matrix into a new square
    matrix indexed by the collected vocabulary.

    Args:
        file_in: path of the triplet file.
        co_mat_file: path passed to ``CooccurMatrix.load``.

    Returns:
        A ``CooccurMatrix`` built from the new matrix and the collected
        vocabulary.
    """
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] == '<':
                continue
            # The final two characters of each raw line are discarded
            # (presumably a two-character terminator -- TODO confirm).
            first_field = (line[:-2]).lower().split('|')[0]
            np1 = cleansing.clean(first_field.split())

            # Delete words not in the similarity vocabulary.
            np1_new = [w for w in np1 if np_voc.contain(w)]
            for w in np1_new:
                np1_all.add(w)

    num_np1 = np1_all.size()
    # Map every collected word to its position in the loaded vocabulary
    # up front; the original repeated both lookups for each matrix cell.
    loaded_positions = []
    for i in range(num_np1):
        loaded_positions.append(np_voc.get_word_index(np1_all.get_word(i)))

    similarity_mat_np1 = zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[loaded_positions[i],
                                                  loaded_positions[j]]

    return CooccurMatrix(similarity_mat_np1, np1_all)

Esempio n. 8

0

Mostra file

File: ocr_insert.py Progetto: RedHenLab/CDI

def ocr_find_story(ocr_results, ocr_time, stories):
    """Locate the story matching an OCR result; return its index or -1.

    The minute and second are parsed from ``ocr_time`` (ten digits, then two
    digits of minute, two digits of second; AttributeError is raised when
    the pattern does not match). Stories are tested with ``time_overlap``
    against the minute window [minute - 3, minute + 3] clamped to [0, 59].

    OCR words are cleaned and those of length <= 2 discarded; with no words
    left the function returns -1. Scanning stops at the first story that
    passes the time test with a positive ``words_overlap_percentage``; that
    story's index is returned only if the value reaches 0.5.
    """
    pattern = re.compile(r"""\d\d\d\d\d\d\d\d\d\d(\d\d)(\d\d).+""", re.I | re.X)
    captured = pattern.match(ocr_time).groups()
    start_minute, start_second = int(captured[0]), int(captured[1])
    earliest_minute = max([start_minute - 3, 0])
    latest_minute = min([start_minute + 3, 59])

    long_words = [w for w in cleansing.clean(ocr_results) if len(w) > 2]
    if len(long_words) == 0:
        return -1

    best_overlap = 0
    best_story = -1
    for position, story in enumerate(stories):
        if time_overlap(earliest_minute, latest_minute, start_second, story):
            fraction = words_overlap_percentage(long_words, story)
            if fraction > best_overlap:
                best_overlap = fraction
                best_story = position
                # NOTE(review): leaving the loop here means later stories
                # are never compared -- confirm this is intended.
                break

    return best_story if best_overlap >= 0.5 else -1

Esempio n. 9

0

Mostra file

File: triplets_mat.py Progetto: RedHenLab/CDI

def generate_triplets_voc(files):
    """Collect the words of a list of triplet files into a TripletVocabulary.

    Lines starting with '<' are skipped. Every other line is lower-cased and
    split on '|'; the cleaned words of field 0 are added with ``add_np`` and
    the cleaned words of fields 1 and 2 are both added with ``add_vp``.

    Returns the populated ``TripletVocabulary``.
    """
    voc = triplet_vocabulary.TripletVocabulary()
    # One registration method per field position: NP1 -> np, VP and NP2 -> vp.
    sinks = (voc.add_np, voc.add_vp, voc.add_vp)
    for path in files:
        with open(path, 'r') as handle:
            for raw in handle:
                if raw[0] == '<':
                    continue
                # Drops the last two characters of the raw line (presumably
                # the line terminator -- TODO confirm the file format).
                fields = raw[:-2].lower().split('|')
                for position, sink in enumerate(sinks):
                    for word in cleansing.clean(fields[position].split()):
                        sink(word)
    return voc

Esempio n. 10

0

Mostra file

def generate_triplets_voc(files):
    """Build a TripletVocabulary from the given triplet files.

    For every line not starting with '<', the line is lower-cased and split
    on '|'. Cleaned words from the first field go to ``add_np``; cleaned
    words from the second and third fields go to ``add_vp``.

    Returns the filled ``TripletVocabulary``.
    """
    result = triplet_vocabulary.TripletVocabulary()
    for triplet_path in files:
        with open(triplet_path, 'r') as source:
            for text in source:
                if text[0] == '<':
                    continue
                # The final two characters of each raw line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                parts = (text[:-2]).lower().split('|')

                # NP1 words populate the np side.
                for token in cleansing.clean(parts[0].split()):
                    result.add_np(token)
                # VP words populate the vp side ...
                for token in cleansing.clean(parts[1].split()):
                    result.add_vp(token)
                # ... and so do NP2 words.
                for token in cleansing.clean(parts[2].split()):
                    result.add_vp(token)
    return result

Esempio n. 11

0

Mostra file

File: tree_pursuit.py Progetto: RedHenLab/CDI

def build_vocabulary(input_triplet_files, word_type='ALL'):
    """Create a Vocabulary from the selected fields of triplet files.

    ``word_type`` chooses which '|'-separated fields contribute words:
    'NP1' (field 0), 'VP' (field 1), 'NP2' (field 2) or 'ALL' (all three).
    Lines starting with '<' are skipped; other lines are lower-cased before
    splitting, and each chosen field is cleaned with ``cleansing.clean``.

    The vocabulary size is logged at INFO level before it is returned.
    """
    vocab = vocabulary.Vocabulary()
    # Positions of the fields requested by word_type.
    chosen = [position
              for position, tag in enumerate(('NP1', 'VP', 'NP2'))
              if word_type in (tag, 'ALL')]

    for triplet_file in input_triplet_files:
        with open(triplet_file, 'r') as handle:
            for raw in handle:
                if raw[0] == '<':
                    continue
                # Drops the last two characters of the raw line (presumably
                # the line terminator -- TODO confirm the file format).
                fields = raw[:-2].lower().split('|')
                collected = []
                for position in chosen:
                    collected.extend(cleansing.clean(fields[position].split()))
                for word in collected:
                    vocab.add(word)

    logging.info('Vocabulary: {0}, {1}.'.format(word_type, vocab.size()))
    return vocab

Esempio n. 12

0

Mostra file

def build_vocabulary(input_triplet_files, word_type='ALL'):
    """Gather cleaned words from triplet files into a new Vocabulary.

    Each line not starting with '<' is lower-cased and split on '|'.
    Depending on ``word_type`` ('NP1', 'VP', 'NP2' or 'ALL'), the first,
    second and/or third field is cleaned and its words added.

    Logs the resulting vocabulary size and returns the vocabulary.
    """
    take_np1 = word_type in ('NP1', 'ALL')
    take_vp = word_type in ('VP', 'ALL')
    take_np2 = word_type in ('NP2', 'ALL')

    vocab = vocabulary.Vocabulary()
    for triplet_file in input_triplet_files:
        with open(triplet_file, 'r') as source:
            for text in source:
                if text[0] == '<':
                    continue
                # The final two characters of each raw line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                parts = (text[:-2]).lower().split('|')
                line_words = []
                if take_np1:
                    line_words += cleansing.clean(parts[0].split())
                if take_vp:
                    line_words += cleansing.clean(parts[1].split())
                if take_np2:
                    line_words += cleansing.clean(parts[2].split())

                for token in line_words:
                    vocab.add(token)
    logging.info('Vocabulary: {0}, {1}.'.format(word_type, vocab.size()))
    return vocab

Esempio n. 13

0

Mostra file

 def add_line(self, line):
     """Split ``line`` on whitespace, clean it, and append each word to ``self.line_list``."""
     tokens = line.split()
     for token in cleansing.clean(tokens):
         self.line_list.append(token)