Example #1
    def _readTraining(self, file_name, training_pairs):
        """Read training pairs from a file"""
        with open(file_name, "r") as f:
            training_pairs_raw = json.load(f)

        training_pairs = {0: [], 1: []}
        for (label, examples) in training_pairs_raw.items():
            for pair in examples:
                training_pairs[int(label)].append(
                    (core.frozendict(pair[0]), core.frozendict(pair[1])))

        training_data = training.addTrainingData(training_pairs, self.data_model, self.training_data)

        return (training_pairs, training_data)
Example #2
    def _readTraining(self, file_name, training_pairs):
        """Read training pairs from a file"""
        with open(file_name, 'r') as f:
            training_pairs_raw = json.load(f, cls=self.training_decoder)

        training_pairs = {0: [], 1: []}
        for (label, examples) in training_pairs_raw.items():
            for pair in examples:
                training_pairs[int(label)].append(
                    (core.frozendict(pair[0]), core.frozendict(pair[1])))

        training_data = training.addTrainingData(training_pairs,
                                                 self.data_model,
                                                 self.training_data)

        return (training_pairs, training_data)
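
Both variants expect the same on-disk shape: a JSON object keyed by the string labels "0" and "1" (conventionally distinct and matching pairs), each mapping to a list of two-record pairs. Note that both immediately rebind the training_pairs argument, so whatever is passed in is discarded. A minimal sketch of producing such a file; the record fields and the deduper object are hypothetical:

import json

# hypothetical labeled pairs; "1" conventionally marks matches, "0" distinct
labeled = {
    "1": [
        [{"name": "thai palace", "address": "500 w main st"},
         {"name": "thai palace restaurant", "address": "500 west main"}],
    ],
    "0": [
        [{"name": "thai palace", "address": "500 w main st"},
         {"name": "burger shack", "address": "12 oak ave"}],
    ],
}

with open("training.json", "w") as f:
    json.dump(labeled, f)

# training_pairs, training_data = deduper._readTraining("training.json", None)
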
from sqlalchemy import MetaData, Table

def makeSampleDict(session_id, fields):
    # worker_session (a SQLAlchemy session) and frozendict are assumed to be
    # supplied at module level by the surrounding application
    session = worker_session
    engine = session.bind
    metadata = MetaData()
    proc_table = Table('processed_%s' % session_id, metadata,
                       autoload=True, autoload_with=engine)
    entity_table = Table('entity_%s' % session_id, metadata,
                         autoload=True, autoload_with=engine)
    cols = [getattr(proc_table.c, f) for f in fields]
    '''
    Get one record from each cluster of exact duplicates that is
    already in the entity map, plus all records that don't have
    entries in the entity map:

    SELECT p.<fields from model>
      FROM processed AS p
      LEFT JOIN entity AS e
        ON p.record_id = e.record_id
     WHERE e.target_record_id IS NULL
    '''
    curs = session.query(*cols)\
        .outerjoin(entity_table,
                   proc_table.c.record_id == entity_table.c.record_id)\
        .filter(entity_table.c.target_record_id.is_(None))
    result = {i: frozendict(zip(fields, row))
              for i, row in enumerate(curs)}
    return result
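
makeSampleDict returns a dict mapping a running index to a frozendict of the requested fields, covering the cluster representatives plus every processed record that has no entry in the entity map. A minimal usage sketch; the session id and field names are hypothetical:

fields = ['name', 'address']                # hypothetical model fields
sample = makeSampleDict('abc123', fields)   # hypothetical session id

# sample looks like:
# {0: frozendict({'name': 'thai palace', 'address': '500 w main st'}),
#  1: frozendict({'name': 'burger shack', 'address': '12 oak ave'}), ...}
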
import csv
import re
from itertools import combinations

def canonicalImport(filename):
    # frozendict is the immutable dict type used by the rest of this codebase
    data_d = {}
    duplicates_d = {}
    with open(filename) as f:
        reader = csv.reader(f)
        header = next(reader)
        for i, row in enumerate(reader):
            instance = {}
            for j, col in enumerate(row):
                if header[j] == "unique_id":
                    duplicates_d.setdefault(col, []).append(i)
                else:
                    # we may want to think about removing common stop
                    # words
                    # col = col.strip()
                    # col = re.sub('[^a-z0-9 ]', ' ', col)
                    # col = re.sub('\.', ' ', col)
                    # col = re.sub(r'\bthe\b', ' ', col)
                    # col = re.sub(r'restaurant', ' ', col)
                    # col = re.sub(r'cafe', ' ', col)
                    # col = re.sub(r'diner', ' ', col)
                    # col = re.sub(r'\(.*\)', ' ', col)

                    # col = re.sub(r'\bn\.', ' ', col)
                    # col = re.sub(r'\bs\.', ' ', col)
                    # col = re.sub(r'\be\.', ' ', col)
                    # col = re.sub(r'\bw\.', ' ', col)
                    col = re.sub(r"\broad\b", "rd", col)
                    col = re.sub("  +", " ", col)

                    instance[header[j]] = col.strip().strip('"').strip("'")

            data_d[i] = frozendict(instance)

    duplicates_s = set()
    for unique_id, rows in duplicates_d.items():
        if len(rows) > 1:
            for pair in combinations(rows, 2):
                duplicates_s.add(frozenset(pair))

    return (data_d, header, duplicates_s)
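
canonicalImport yields the cleaned records keyed by row index, the CSV header, and the set of known duplicate pairs derived from the unique_id column. A minimal usage sketch; the filename is hypothetical:

data_d, header, duplicates_s = canonicalImport('restaurants.csv')  # hypothetical file

# data_d      : {row_index: frozendict of normalized field values}
# header      : column names from the CSV's first row
# duplicates_s: frozensets pairing row indexes that share a unique_id,
#               i.e. the ground-truth duplicate pairs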