Example #1
def saveDictation():
    '''Endpoint that saves a sound file for later acoustic adaptation.

        Body:
            cookie: Cookie of current user.
            text: Transcription of the current dictation.
        Files:
            url: Sound file.

        '''
    cookie = request.form['cookie']
    # Get current user
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']
    text = request.form['text']
    url = request.files['url']

    out = os.path.join('./data', email_name)
    os.makedirs(out, exist_ok=True)
    wav_path = os.path.join(out, 'wav')
    os.makedirs(wav_path, exist_ok=True)

    # Keep the number of dictations in the database.
    res = database.find_one('savedDictations', {'_id': email_name})
    if res is None:
        counter = 0
        database.insert_one('savedDictations', {
            '_id': email_name,
            'num': counter
        })
    else:
        counter = res['num'] + 1
        database.update_one('savedDictations', {'_id': email_name},
                            {"$set": {
                                'num': counter
                            }})

    # Keep a file that contains all the ids.
    with open(os.path.join(out, 'ids'), 'a') as f:
        f.write(str(counter) + '\n')
    # Keep a file that contains all the transcriptions.
    with open(os.path.join(out, 'transcriptions'), 'a') as f:
        f.write('<s> ' + text.strip('\n') + ' </s> ' + ' (' + str(counter) +
                ')' + '\n')

    # Save current dictation in filesystem.
    url.save(os.path.join(wav_path, str(counter) + '.wav'))
    return {'message': 'OK'}
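
A minimal client-side sketch of calling this endpoint with requests. The base URL and route name are assumptions, since the Flask route decorator is not shown in the excerpt:

import requests

# Hypothetical base URL and route; the route decorator is not shown above.
with open('dictation.wav', 'rb') as wav:
    resp = requests.post(
        'http://localhost:5000/saveDictation',
        data={'cookie': 'session-cookie-value', 'text': 'hello world'},
        files={'url': wav},  # the endpoint reads request.files['url']
    )
print(resp.json())  # expected: {'message': 'OK'}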
Example #2
def log_out():
    '''Endpoint that removes user's data.

        Body:
            cookie: Cookie of current user.

        '''
    cookie = request.form['cookie']
    # Get current user
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']
    keep = res['keep']
    database.delete_one('connections', {'_id': cookie})

    # If the user doesn't want to keep their data, remove all of it.
    if keep != "yes":
        database.delete_one('messages', {'_id': email_name})
        database.delete_one('clusters', {'_id': email_name})
        database.delete_one('savedDictations', {'_id': email_name})
        database.delete_one('users', {'_id': email_name})
        out = os.path.join("./data", email_name)
        if os.path.exists(out):
            shutil.rmtree(out)

    return jsonify({'message': 'OK'})
Example #3
def getDictation():
    '''Endpoint that decodes speech to text.

        Body:
            cookie: Cookie of current user.
        Files:
            url: Sound file to decode.

        '''
    cookie = request.form['cookie']
    url = request.files['url']

    # Get current user
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']

    # Save current dictation in filesystem.
    out = os.path.join('./data', email_name)
    mllr_path = os.path.join(out, 'acoustic')
    url.save(os.path.join(out, 'curr_dictation.wav'))

    # Speech to text using default models.
    decoded_gen = get_text_pocketsphinx(out, lmPath, acousticPath, dictPath,
                                        "")

    # Classify output text to computed clusters.
    res = database.find_one('clusters', {'_id': email_name})
    centers = res['centers']
    metric = res['metric']
    doc = nlp(decoded_gen)
    decoded_gen_spacy = doc.vector
    cluster = closest_cluster(np.array(centers), decoded_gen_spacy, metric)
    clusterPath = os.path.join(out, 'clusters/cluster_' + str(cluster))
    lmAdaptPath = os.path.join(clusterPath, 'merged.lm')
    # Speech to text using adapted models.
    decoded_adapt = get_text_pocketsphinx(out, lmAdaptPath, acousticPath,
                                          dictPath, mllr_path)

    # Compute possible error words.
    sent_errors = error_detector(lmAdaptPath, decoded_adapt, 0.05)
    returned_data = {
        'text_gen': decoded_gen,
        'text_adapt': decoded_adapt,
        'cluster': cluster,
        'errors': sent_errors
    }
    return jsonify(returned_data)
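
closest_cluster is not defined in these excerpts. A minimal sketch consistent with how it is called here (an array of centers, a spaCy document vector, and a metric name), using scipy's cdist:

import numpy as np
from scipy.spatial.distance import cdist

def closest_cluster(centers, vector, metric):
    # Sketch only; the project's actual helper is not shown.
    # `metric` is any distance name cdist accepts, e.g. 'euclidean' or 'cosine'.
    distances = cdist(centers, vector.reshape(1, -1), metric=metric)
    return int(np.argmin(distances))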
Example #4
def get_random_email():
    '''Endpoint that returns a random email sentence to the user.

        Body:
            cookie: Cookie of current user.
        '''

    cookie = request.form['cookie']
    # Get current user
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']
    res = database.find_one('messages', {'_id': email_name})
    messages_col = res['messages']
    sentences = []
    for msg in messages_col:
        sentences.extend(msg['processed_body'])
    sel_sentence = random.choice(sentences)
    return jsonify({'email': sel_sentence})
Example #5
def adapt_acoustic():
    '''Endpoint that adapts the default acoustic model using the user's previous dictations.

        Body:
            cookie: Cookie of current user.
        '''
    cookie = request.form['cookie']
    # Get current user
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']

    # Define necessary paths.
    out = os.path.join('./data', email_name)
    wav = os.path.join(out, 'wav')
    output = os.path.join(out, 'acoustic/')
    ids = os.path.join(out, 'ids')
    transcriptions = os.path.join(out, 'transcriptions')
    feat_params = os.path.join(acousticPath, 'feat.params')
    mfc_path = os.path.join(output, 'mfc')

    # Remove previous adaptation if exists.
    if os.path.exists(output):
        shutil.rmtree(output)

    # Generate counts.
    generate_command = 'sphinx_fe -argfile ' + feat_params + ' -samprate 16000 -c ' + \
        ids + ' -di ' + wav + ' -do ' + mfc_path + ' -ei wav -eo mfc -mswav yes'
    if subprocess.call([generate_command], shell=True):
        print('Error in subprocess')
    shutil.copy2(sphinxtrain + 'bw', output)
    shutil.copy2(sphinxtrain + 'map_adapt', output)
    shutil.copy2(sphinxtrain + 'mk_s2sendump', output)

    mdef_path = os.path.join(acousticPath, 'mdef.txt')
    counts_path = os.path.join(output, 'counts')
    os.makedirs(counts_path)
    feature_path = os.path.join(acousticPath, 'feature_transform')
    bw_command = output + 'bw -hmmdir ' + acousticPath + ' -cepdir ' + mfc_path + ' -moddeffn ' + mdef_path + ' -ts2cbfn .cont. -feat 1s_c_d_dd -cmn batch -agc none \
                        -dictfn ' + dictPath + ' -ctlfn ' + ids + ' -lsnfn ' + transcriptions + ' -accumdir ' + counts_path + ' -lda ' + feature_path + ' -varnorm no -cmninit 40,3,-1'
    if subprocess.call([bw_command], shell=True):
        print('Error in subprocess')

    shutil.copy2(sphinxtrain + 'mllr_solve', output)
    means_path = os.path.join(acousticPath, 'means')
    variance_path = os.path.join(acousticPath, 'variances')
    mllr_path = os.path.join(output, 'mllr_matrix')
    mllr_command = output + 'mllr_solve -meanfn ' + means_path + ' -varfn ' + variance_path + \
        ' -outmllrfn ' + mllr_path + ' -accumdir ' + counts_path
    if subprocess.call([mllr_command], shell=True):
        print('Error in subprocess')

    return {'message': 'OK'}
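
The adaptation commands above are assembled as shell strings and only print on failure. As a design note, the same calls can be made with argument lists and check=True so a failed step raises instead of being silently logged. A sketch, reusing the path variables defined in adapt_acoustic (run_checked is a hypothetical helper):

import subprocess

def run_checked(cmd_args):
    # Raises CalledProcessError on a non-zero exit code.
    subprocess.run(cmd_args, check=True)

# The sphinx_fe step from above, rewritten as an argument list.
run_checked([
    'sphinx_fe', '-argfile', feat_params, '-samprate', '16000',
    '-c', ids, '-di', wav, '-do', mfc_path,
    '-ei', 'wav', '-eo', 'mfc', '-mswav', 'yes',
])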
Example #6
def login():
    username = request.form.get("username", default=None)
    password = request.form.get("password", default=None)
    if not username or not password:
        req = {"code": "0001", "result": "Invalid username or password format", "data": {}}
        return jsonify(req)
    print(request.form)
    ret = database.find_one(collection_name, {
        "username": username,
        "password": password
    })
    if ret is None:
        req = {"code": "0001", "result": "Incorrect username or password", "data": {}}
        return jsonify(req)
    else:
        req = {
            "code": "0000",
            "result": "登陆成功",
            "data": {
                "username": ret["username"],
                "token": str(ret["_id"])
            }
        }
        return jsonify(req)
Example #7
def get_user_info(username):
    '''
    Return the user's starred repos if the user exists in the db.
    '''
    user = find_one('online_user', {'name': username})
    return user
Example #8
def getEmails():
    '''Endpoint that returns the sent emails of a Gmail user.

        Body:
            cookie: Cookie of current user.
            token: Authentication token from Gmail API.
            keep: If true, keep the current user's data after logout.
        '''

    data = request.form
    token = data['token']
    cookie = data['cookie']
    keep = data['keep']
    # Get user's info and save a new connection in the database.
    email_name, name, picture = getInfo(token)
    database.insert_one('connections', {
        '_id': cookie,
        'email_name': email_name,
        'keep': keep
    })

    # If user's data exist, return the emails.
    res = database.find_one('users', {'_id': email_name})
    if res is not None:
        res = database.find_one('messages', {'_id': email_name})
        if res is not None:
            messages = res['messages']
            return jsonify(messages)
        else:
            # In case data were not synchronized correctly.
            database.delete_one('users', {'_id': email_name})

    database.insert_one('users', {
        '_id': email_name,
        'name': name,
        'picture': picture
    })
    # Send get request in gmail api to get the user's emails.
    read_endpoint = "https://www.googleapis.com/gmail/v1/users/me/messages"
    headers = {
        'Authorization': 'Bearer ' + token,
        'Accept': 'application/json'
    }
    read_response = requests.get(read_endpoint,
                                 headers=headers,
                                 params={'labelIds': ['SENT']})
    messages = read_response.json().get('messages', [])
    clean_messages = []
    for message in messages:
        # Get each message by its id.
        get_endpoint = ("https://www.googleapis.com/gmail/v1/users/me/messages/"
                        + message['id'])
        get_response = requests.get(get_endpoint,
                                    headers=headers,
                                    params={'format': 'raw'})
        raw_msg = get_response.json().get("raw")
        string_message = str(base64.urlsafe_b64decode(raw_msg), "ISO-8859-7")
        # Convert current message to mime format.
        mime_msg = email.message_from_string(string_message)
        # Convert current message from mime to string.
        body, msg_headers = mime2str(mime_msg)
        # Clean current message.
        processed_body = process_text(body)
        size = len(msg_headers)
        clean_messages.append({
            'body': body,
            'processed_body': processed_body,
            'sender': (msg_headers[0] if size > 0 else " "),
            'subject': (msg_headers[2] if size > 2 else " ")
        })

    # Save user's emails in database and return them.
    database.insert_one('messages', {
        '_id': email_name,
        'messages': clean_messages
    })
    return jsonify(clean_messages)
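
mime2str is not shown in these excerpts. A sketch of what the decode step presumably does, walking the MIME tree for plain-text parts and collecting a few headers (decode_raw_message and the header order are assumptions):

import base64
import email

def decode_raw_message(raw_msg, charset='ISO-8859-7'):
    # Hypothetical stand-in for mime2str; the real helper is not shown.
    mime_msg = email.message_from_string(
        str(base64.urlsafe_b64decode(raw_msg), charset))
    parts = []
    for part in mime_msg.walk():
        if part.get_content_type() == 'text/plain':
            payload = part.get_payload(decode=True)
            if payload:
                parts.append(payload.decode(charset, errors='replace'))
    # Assumed header order: sender first, subject third, matching the
    # msg_headers[0] / msg_headers[2] indexing above.
    headers = [mime_msg.get('From', ' '),
               mime_msg.get('Date', ' '),
               mime_msg.get('Subject', ' ')]
    return '\n'.join(parts), headers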
Example #9
def getClusters():
    '''Endpoint that clusters the emails.

        Body:
            cookie: Cookie of current user.
            metric: Metric to be used for closest point calculation.
            n_clusters: Number of clusters.
            method: Method of selecting the number of clusters (elbow, silhouette).
            min_cl: Min number of clusters.
            max_cl: Max number of clusters.
            level: Level of clustering (per sentence or per email).
        '''
    data = request.form
    cookie = data['cookie']
    metric = data['metric']
    n_clusters = data['n_clusters']
    method = data['method']
    min_cl = int(data['min_cl'])
    max_cl = int(data['max_cl'])
    level = data['level']

    # Get current user.
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']

    # Get messages of current user.
    res = database.find_one('messages', {'_id': email_name})
    messages_col = res['messages']
    # Keep them as sentences if asked to.
    emails = []
    for msg in messages_col:
        if level == "sentence":
            emails.extend(msg['processed_body'])
        else:
            emails.append(" ".join(msg['processed_body']))

    # Represent them as vectors.
    X = get_spacy(emails, nlp)

    if n_clusters == "":
        # Get metrics in different number of clusters (range [min_cl, max_cl]).
        sse, silhouette = get_metrics(X, min_cl, max_cl)
        if method == 'elbow':
            n_clusters = find_knee(sse, min_cl)
        else:
            n_clusters = silhouette_analysis(silhouette, min_cl)
    # Run k-means with given number of clusters.
    n_clusters = int(n_clusters)
    labels, centers = run_kmeans(X, n_clusters)

    # Save computed clusters in filesystem.
    out = os.path.join('./data', email_name, 'clusters')
    save_clusters(emails, labels, os.path.join(email_name, 'clusters'))
    cluster2text(out, n_clusters)

    # Get a sample for each cluster.
    samples = []
    for i in range(n_clusters):
        samples.append(emails[closest_point(centers[i], X, metric)])

    # We want to keep some representative words for each cluster
    # in order to identify the topic it represents, so we take
    # the words with the highest tf-idf score in each cluster.
    cv = CountVectorizer(stop_words=STOP_WORDS)
    tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
    keywords_total = []
    for i in range(n_clusters):
        emails_cluster = [
            emails[j] for j in range(len(emails)) if labels[j] == i
        ]
        word_count_vector = cv.fit_transform(emails_cluster)
        tfidf.fit(word_count_vector)
        feature_names = cv.get_feature_names()
        tf_idf_vector = tfidf.transform(cv.transform(emails_cluster))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
        keywords_total.append(keywords)

    # Delete previous user's clustering.
    database.delete_one('clusters', {'_id': email_name})
    # Insert computed clusters in database.
    database.insert_one(
        'clusters', {
            '_id': email_name,
            'centers': centers.tolist(),
            'labels': labels.tolist(),
            'samples': samples,
            'keywords': keywords_total,
            'metric': metric
        })

    clusters = [[] for _ in range(n_clusters)]
    for idx, mail in enumerate(emails):
        clusters[labels[idx]].append(mail)

    weight = '0.5'
    # Create language models using srilm.
    for cluster in os.listdir(out):
        cluster_path = os.path.join(out, cluster)
        if os.path.isdir(cluster_path):
            count_command = ('ngram-count -kndiscount -interpolate -text ' +
                             os.path.join(cluster_path, 'corpus') +
                             ' -wbdiscount1 -wbdiscount2 -wbdiscount3 -lm ' +
                             os.path.join(cluster_path, 'model.lm'))
            if subprocess.call([count_command], shell=True):
                print('Error in subprocess')
            mix_command = ('ngram -lm ' + lmPath + ' -mix-lm ' +
                           os.path.join(cluster_path, 'model.lm') +
                           ' -lambda ' + weight + ' -write-lm ' +
                           os.path.join(cluster_path, 'merged.lm'))
            if subprocess.call([mix_command], shell=True):
                print('Error in subprocess')

    return jsonify({
        'samples': samples,
        'keywords': keywords_total,
        'clusters': clusters
    })
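
sort_coo and extract_topn_from_vector are not defined here. Minimal sketches consistent with their use above: sort the tf-idf entries of the sparse COO matrix by descending score, then map the top-n column indices back to feature names:

def sort_coo(coo_matrix):
    # Sketch only; pairs each column index with its tf-idf score.
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    # Keep the top-n scored terms as {word: score}.
    return {feature_names[idx]: round(score, 3)
            for idx, score in sorted_items[:topn]}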
Example #10
def index():
    return str(database.find_one(database="test", collection="test"))