Example No. 1
import numpy as np
import pylab as pl
from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA


def affinity(infinitives):
    print("Extracting features...")
    X, _ = extract_features(infinitives, 3, False)

    # Similarity matrix: negated squared Euclidean distances between feature rows.
    X_norms = np.sum(X * X, axis=1)
    S = -X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    p = 10 * np.median(S)  # preference: more negative -> fewer clusters

    print("Fitting affinity propagation clustering...")
    af = AffinityPropagation(affinity='precomputed', preference=p).fit(S)
    indices = af.cluster_centers_indices_
    for i, idx in enumerate(indices):
        print(i, infinitives[idx])

    n_clusters_ = len(indices)

    print("Fitting PCA...")
    # Project the feature matrix to 2-D for plotting only.
    X = PCA(n_components=2, svd_solver='randomized').fit_transform(X)

    print("Plotting...")
    pl.figure(1)
    pl.clf()

    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = af.labels_ == k
        cluster_center = X[indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()
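The function above only defines the pipeline; below is a minimal driver sketch, assuming `extract_features(infinitives, n, use_counts)` returns a dense (n_samples, n_features) matrix and that `infinitives` is a NumPy array of strings (so that `infinitives[idx]` indexing works). The verb list is a toy placeholder, not data from the original project.

import numpy as np

if __name__ == '__main__':
    # Toy verb list; integer indexing (infinitives[idx]) requires a NumPy array.
    infinitives = np.array(['hablar', 'comer', 'vivir', 'cantar', 'beber',
                            'subir', 'partir', 'correr', 'amar'])
    affinity(infinitives)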
Example No. 2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors


def k_clusters(k, infinitives):
    data, _ = extract_features(infinitives, 3, False)
    kmeans = KMeans(n_clusters=k).fit(data)
    print(kmeans.inertia_)
    # Map each cluster center back to its nearest infinitive in feature space.
    nn = NearestNeighbors(n_neighbors=1).fit(data)
    _, idx = nn.kneighbors(kmeans.cluster_centers_)
    for inf in infinitives[idx.flatten()]:
        print(inf)
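Since the function prints `kmeans.inertia_`, a natural companion is a short sketch that scans several values of k and prints the inertia for each, as a rough elbow heuristic. It assumes the same `extract_features` call and a toy NumPy array of infinitives; neither is part of the original snippet.

import numpy as np
from sklearn.cluster import KMeans

infinitives = np.array(['hablar', 'comer', 'vivir', 'cantar', 'beber', 'subir'])
data, _ = extract_features(infinitives, 3, False)   # same featurization as above
for k in range(2, len(infinitives)):
    # Lower inertia means tighter clusters; look for the "elbow" as k grows.
    print(k, KMeans(n_clusters=k, n_init=10).fit(data).inertia_)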
Example No. 3
import pylab as pl


def plot_projection(model, infinitives, title):
    fig = pl.figure()
    # Binary model: n-gram appears or not
    for i in range(1, 4):  # n-gram length (1 to 3)
        pl.subplot(2, 3, i)
        data, _ = extract_features(infinitives, i, False)
        projected_data = model.fit(data).transform(data)
        pl.scatter(projected_data[:, 0], projected_data[:, 1])
        pl.title('Binary %d-grams' % i)
    # Frequency model: count the occurrences
    for i in range(1, 4):
        pl.subplot(2, 3, 3 + i)
        data, _ = extract_features(infinitives, i, True)
        projected_data = model.fit(data).transform(data)
        pl.scatter(projected_data[:, 0], projected_data[:, 1])
        pl.title('Count %d-grams' % i)
    fig.text(.5, .95, title, horizontalalignment='center')
    pl.figlegend()
    pl.show()
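One possible way to call this, sketched under the assumption that any estimator exposing `fit` and `transform` (for example scikit-learn's PCA) is an acceptable `model`, with a toy verb list standing in for the real corpus:

import numpy as np
from sklearn.decomposition import PCA

infinitives = np.array(['hablar', 'comer', 'vivir', 'cantar', 'beber', 'subir'])
plot_projection(PCA(n_components=2), infinitives,
                'PCA projection of verb n-gram features')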
Example No. 4
import json

from flask import request, jsonify

import predict
import preprocess


def getName():
    # Flask view: expects a JSON body of the form {"text": "..."}.
    text = json.loads(request.data)['text']
    tkn_txt = preprocess.tokenize_text(text)
    while "" in tkn_txt:
        tkn_txt.remove("")
    X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]

    pred = predict.predict_label(X)
    inputArray = text.split(' ')
    respObj = {
        "data": [{
            'word': w,
            'isName': p
        } for w, p in zip(inputArray, pred[0])]
    }

    return jsonify(respObj)
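The snippet reads like a Flask view function, but its route registration is not shown. The wiring below is a hedged sketch: the '/getName' URL and the app object are assumptions, and Flask's test client is used only to exercise the view locally.

import json
from flask import Flask

app = Flask(__name__)
# The URL is an assumption; the original snippet does not show its route.
app.add_url_rule('/getName', view_func=getName, methods=['POST'])

if __name__ == '__main__':
    with app.test_client() as client:
        resp = client.post('/getName',
                           data=json.dumps({'text': 'My name is Saikat'}),
                           content_type='application/json')
        print(resp.get_json())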
Example No. 5
    def get_features(self):
        # Take the last `classification_length` samples ending at `i_channel_data`,
        # treating `channel_data` as a circular buffer: when the window would start
        # before column 0, it wraps around to the end of the buffer.
        i_channel_data = self.i_channel_data
        i_start = i_channel_data - self.classification_length

        if i_start < 0:
            length_from_end = -i_start
            length_from_start = self.classification_length - length_from_end

            channel_data_copy = np.hstack(
                (self.channel_data[:, 0:length_from_start],
                 self.channel_data[:, self.channel_data.shape[1] -
                                   length_from_end:]))
        else:
            channel_data_copy = self.channel_data[:, i_start:i_start +
                                                  self.classification_length]

        features, _ = extract_features(channel_data_copy)

        return features.flatten()
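The indexing in the wrap-around branch is the only subtle part. The standalone sketch below reproduces that branch on a tiny array so the resulting column order is visible; the buffer contents and window length are made up for illustration.

import numpy as np

channel_data = np.arange(12).reshape(2, 6)        # 2 channels, 6 buffered samples
i_channel_data, classification_length = 2, 4
i_start = i_channel_data - classification_length  # -2: the window wraps
length_from_end = -i_start
length_from_start = classification_length - length_from_end
window = np.hstack((channel_data[:, :length_from_start],
                    channel_data[:, channel_data.shape[1] - length_from_end:]))
print(window)   # columns 0-1 followed by columns 4-5 of each channel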
Example No. 6
from bs4 import BeautifulSoup


def get_inputs(dom):
    soup = BeautifulSoup(dom, 'html5lib')
    # Remove tags whose content is unrelated to form inputs.
    for invisible_tag in ['style', 'script', '[document]', 'head', 'title']:
        for tag in soup.find_all(invisible_tag):
            tag.decompose()
    # Special case for nested spans not caught in off-line forms: empty any
    # element marked unselectable="on".
    for element in soup.find_all():
        if element.attrs and element.has_attr('unselectable') \
                and 'on' in element['unselectable']:
            element.clear()
    input_list = []
    # Collect one (feature string, XPath) pair per visible <input> of each type of interest.
    for input_type in input_types:
        inputs = soup.find_all('input', attrs={'type': input_type})
        for my_input in inputs:
            if is_invisible(my_input):
                continue
            xpath = get_xpath(my_input)
            feature = extract_features(my_input)
            feature = ' '.join(sorted(feature.split()))
            input_list.append((feature, xpath))
    return input_list
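A hedged driver sketch follows. It assumes the module that defines `get_inputs` also provides `input_types`, `is_invisible`, `get_xpath`, and `extract_features` (referenced above but not shown here), and it feeds in a small hand-written form; the HTML is illustrative only.

html = '''
<html><body>
  <form>
    <input type="text" name="username">
    <input type="password" name="pwd">
    <input type="hidden" name="csrf_token">
  </form>
</body></html>
'''
for feature, xpath in get_inputs(html):
    print(xpath, '->', feature)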
Example No. 7
import preprocess

tkn_txt = preprocess.tokenize_text('My name is Saikat')
while("" in tkn_txt) : 
    tkn_txt.remove("") 
print(preprocess.pos_tagger([tkn_txt]))
X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]

print(preprocess.predict_label(X))