Example #1
0
def ml_neuronet():
    """Code uncoded brand answers with the pickled classifier.

    Loads the model stored at ``NEURONET_PATH``, turns every cleaned answer
    into a feature row via ``CharVectorizer`` and, when the highest class
    probability reaches ``NEURONET_CUTOFF``, queues an ``'ml'`` record that
    is finally handed to ``write_records`` for ``open_coded``.
    """
    print('coding using ml...', end=' ')
    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # NEURONET_PATH must always point at a trusted artifact.
    with open(NEURONET_PATH, 'rb') as model_file:
        classifier = pickle.load(model_file)

    alphabet = "abcdefghijklmnopqrstuvwxyzßäöü1234567890"
    char_vectorizer = CharVectorizer(
        alphabet, fill_left_char=">", fill_right_char="<")

    # Answers are cut to this length and the vectorizer is asked for it too.
    target_length_in = 100

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        trimmed = clean_verbatim(answer)[:target_length_in]
        encoded = char_vectorizer.transform([trimmed], target_length_in)
        features = pd.DataFrame(data=encoded)
        top_probability = classifier.predict_proba(features).max(axis=1)[0]
        if top_probability >= NEURONET_CUTOFF:
            predicted_code = classifier.predict(features)[0]
            # Double single quotes so the raw answer is a valid SQL literal.
            escaped_answer = answer.replace("'", "''")
            records.append(
                f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'ml', {predicted_code}, {top_probability}, '')'''
            )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #2
0
def repeats():
    """Flag uncoded brand answers that contain a repeated character pattern.

    Each cleaned answer is cut into consecutive, non-overlapping chunks of
    3, then 2, then 1 characters (always starting at offset 0). The first
    chunk that occurs three times in a row, and is not made up solely of
    spaces or solely of dots, marks the answer as a ``'repeats'`` record
    with code 0; the chunk itself is stored as the found pattern.
    """
    print('Coding using repeats...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        clean_answer = clean_verbatim(answer)

        found_pattern = None
        for width in (3, 2, 1):
            chunks = [clean_answer[start:start + width]
                      for start in range(0, len(clean_answer), width)]
            # zip ends with the shortest slice, so fewer than three chunks
            # simply yield no triple to inspect.
            for first, second, third in zip(chunks, chunks[1:], chunks[2:]):
                if first != second or second != third:
                    continue
                # Substring tests: skip chunks consisting only of blanks
                # or only of dots.
                if first in '   ' or first in '...':
                    continue
                found_pattern = first
                break
            if found_pattern is not None:
                break

        if found_pattern is None:
            continue

        # Both values are user text placed inside SQL string literals, so
        # embedded single quotes have to be doubled. The pattern was
        # previously written unescaped.
        escaped_answer = answer.replace("'", "''")
        escaped_pattern = found_pattern.replace("'", "''")
        records.append(
            f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'repeats', 0, 1, '{escaped_pattern}')'''
        )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #3
0
def lookup(func, func_name, library):
    """Code uncoded brand answers by exact lookup in ``library``.

    Args:
        func: Callable applied to every raw answer; its stripped result is
            the lookup key.
        func_name: Label printed in the progress line and written into each
            record as the coding method.
        library: Mapping from lookup key to code.

    The records are passed to ``write_records`` for ``open_coded``.
    """
    print(f'Coding using {func_name}...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        key = func(answer).strip()
        escaped_key = key.replace("'", "''")

        # Look up the unescaped key first: SQL quoting is an output concern
        # and must not prevent a match on keys that contain an apostrophe.
        # The escaped form is still tried afterwards so that a library whose
        # keys were stored pre-escaped keeps matching as before.
        if key in library:
            code = library[key]
        elif escaped_key in library:
            code = library[escaped_key]
        else:
            continue

        escaped_answer = answer.replace("'", "''")
        records.append(
            f'''({serial}, '{variable}', {position}, '{escaped_answer}', '{func_name}', {code}, 1, '{escaped_key}')'''
        )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #4
0
def numbers():
    """Flag uncoded brand answers that match ``REGEX_CRITERIA``.

    Every cleaned answer is searched with ``re.findall``; when there is at
    least one match, a ``'num'`` record with code 0 is queued and the
    comma-joined matches are stored as the candidate.
    """
    print('Coding using numbers...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        clean_answer = clean_verbatim(answer)
        found = re.findall(REGEX_CRITERIA, clean_answer)
        if not found:
            continue

        # The matches are slices of user text that end up inside a SQL
        # string literal, so quote them the same way as the raw answer.
        candidate = ','.join(found).replace("'", "''")
        escaped_answer = answer.replace("'", "''")
        records.append(
            f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'num', 0, 1, '{candidate}')'''
        )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #5
0
def svss():
    """Flag uncoded brand answers that are stock non-answers or too short.

    A cleaned answer equal to one of the fixed phrases below yields an
    ``'svss'`` record; a cleaned answer shorter than two characters yields
    an ``'svss_len'`` record. Both use code 0.
    """
    print('Coding using svss...', end=' ')

    non_answers = (
        'ich weiss nicht',
        'ich weiß nicht',
        'ich weis nicht',
        'ich weiß nicht was',
        'ich weiß',
        'ich hatte bin',
        'ich habe bin',
    )

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        cleaned = clean_verbatim(answer)

        if cleaned in non_answers:
            escaped_answer = answer.replace("'", "''")
            records.append(
                f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'svss', 0, 1, '{cleaned}')'''
            )

        if len(cleaned) < 2:
            escaped_answer = answer.replace("'", "''")
            records.append(
                f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'svss_len', 0, 1, '')'''
            )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #6
0
def lev():
    """Code uncoded brand answers by best Levenshtein ratio.

    Each cleaned answer is compared against every key of ``clean_library``
    with ``lv.ratio``. The highest ratio strictly above
    ``LEVENSHTEIN_CUTOFF`` wins (on ties the entry iterated first is kept)
    and produces a ``'lev'`` record carrying the matched code, the ratio
    and the matched library key.
    """
    print('Coding using levenshtein...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        clean_answer = clean_verbatim(answer)

        max_score, code, candidate = 0.0, 0, ''
        for verbatim, verbatim_code in clean_library.items():
            score = lv.ratio(clean_answer, verbatim)
            if score > LEVENSHTEIN_CUTOFF and score > max_score:
                max_score, code, candidate = score, verbatim_code, verbatim

        if max_score > LEVENSHTEIN_CUTOFF:
            # The library key goes into a SQL string literal as well, so it
            # needs the same quote doubling as the answer (lev_test already
            # escapes these keys; this function did not).
            escaped_answer = answer.replace("'", "''")
            escaped_candidate = candidate.replace("'", "''")
            records.append(
                f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'lev', {code}, {max_score}, '{escaped_candidate}')'''
            )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #7
0
def tokenize_answers():
    """Split multi-line brand answers into one record per non-empty line.

    Selects answers of ``brand`` variables that contain a line feed, splits
    them on ``'\\n'`` and queues every non-empty part for ``open_tokenized``.
    The stored index is the part's position in the split result, so empty
    parts leave gaps in the numbering.
    """
    print('Tokenizing answers...', end=' ')

    rows = mssql_engine.execute(f'''
        select o.serial, o.variable, o.answer
        from import_open as o
            join open_variables as v
                on o.variable = v.variable
        where {settings.SERIAL_CRITERIA}
            and o.answer like '%' + char(10) + '%'
            and v.type = 'brand'
        ''').fetchall()

    records = []
    for serial, variable, answer in rows:
        for position, part in enumerate(answer.split('\n')):
            if not part:
                continue
            # Double single quotes so the token is a valid SQL literal.
            escaped_part = part.replace("'", "''")
            records.append(
                f'''({serial}, '{variable}', {position}, char(10), '{escaped_part}')'''
            )

    write_records(records, 'open_tokenized')
    print(f'{len(records)} records')
Example #8
0
def bigramms():
    """Flag uncoded brand answers that contain enough entries of ``BIGRAMMS``.

    For every cleaned answer the occurrences of all contained ``BIGRAMMS``
    entries are summed; a total of three or more yields a ``'bigramms'``
    record with code 0 and the comma-joined list of entries that were found.
    """
    print('Coding using bigramms...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from open_brands_uncoded
        where {settings.SERIAL_CRITERIA}
        ''').fetchall()

    records = []
    for serial, variable, position, answer in rows:
        cleaned = clean_verbatim(answer)
        present = [bigramm for bigramm in BIGRAMMS if bigramm in cleaned]
        occurrences = sum(cleaned.count(bigramm) for bigramm in present)
        if occurrences >= 3:
            escaped_answer = answer.replace("'", "''")
            joined = ','.join(present)
            records.append(
                f'''({serial}, '{variable}', {position}, '{escaped_answer}', 'bigramms', 0, 1, '{joined}')'''
            )

    write_records(records, 'open_coded')
    print(f'{len(records)} records')
Example #9
0
def lev_test():
    """Store the two best Levenshtein matches for every row of ``lev_test``.

    Each cleaned answer is scored against all keys of ``clean_library`` with
    ``lv.ratio``. Matches with a ratio above 0 are ordered best-first, and up
    to two are kept; a match is skipped when its code equals the code of the
    match kept just before it. The kept matches are written to
    ``lev_test_results`` together with their rank (0 or 1).
    """
    print('Coding using levenshtein...', end=' ')

    rows = mssql_engine.execute(f'''
        select serial, variable, position, answer
        from lev_test
        ''').fetchall()

    records = []
    for processed, (serial, variable, position, answer) in enumerate(rows, start=1):
        cleaned = clean_verbatim(answer)
        escaped_cleaned = cleaned.replace("'", "''")

        scored = []
        for verbatim, code in clean_library.items():
            ratio = lv.ratio(cleaned, verbatim)
            if ratio > 0:
                scored.append((ratio, verbatim, code))
        # Stable sort: entries with equal ratio keep their library order.
        scored = sorted(scored, key=lambda entry: entry[0], reverse=True)

        last_code = -1
        top_matches = []
        for ratio, verbatim, code in scored:
            if len(top_matches) == 2:
                break
            if last_code == code:
                continue
            rank = len(top_matches)
            escaped_verbatim = verbatim.replace("'", "''")
            top_matches.append(
                f'''({serial}, '{variable}', {position}, '{escaped_cleaned}', {rank}, {ratio}, '{escaped_verbatim}', {code})'''
            )
            last_code = code

        records += top_matches

        if processed % 10000 == 0:
            print(f'{processed} respondents processed')

    write_records(records, 'lev_test_results')
    print(f'{len(records)} records')