Example no. 1
def create_model(doc_list):
    '''
    Given a list of documents in Pattern.Vector Document format, create a
    Pattern.Vector Model.
    '''
    print "Creating a TFIDF model for {} documents".format(len(doc_list))
    return Model(documents=doc_list, weight=TFIDF)
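A minimal usage sketch, not part of the original example: it assumes Document (along with Model and TFIDF) is imported from pattern.vector as the function requires, and the two sample documents are invented.
# Hypothetical usage of create_model(); the documents are made-up examples.
docs = [Document("The cat purrs.", name="cat"),
        Document("The dog wags his tail.", name="dog")]
m = create_model(docs)          # prints "Creating a TFIDF model for 2 documents"
print(len(m.documents))         # 2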
Example no. 2
def word_ranking(text, n='L2'):
    """
    extract most relevant sentences from text according to LSA algorithm
    steps:    
    1. tokenize text by sentences
    2. compute tfidf matrix
    3. applying SVD of tfidf matrix (reduce to n-dimensions) 
    4. ranking sentences according to cross-method (source: http://www.aclweb.org/anthology/C10-1098.pdf)
        
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     # syntactic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos != "JJ" and pos != 'NN':  # keep only adjectives and nouns
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create the list of documents
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # number of dimensions equal to the Euclidean (L2) norm of the singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # term ranking according to the cross method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topic(rows) x tokens(cols) matrix(tfidf)
    V = np.array(m.lsa.vt)

    # average score of each concept/topic (row-wise mean of the Vt matrix)
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma (singular value) matrix obtained from the SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # cross-method length score of each term (sum over concepts, weighted by sigma)
    length = np.sum(V * S, axis=0)

    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
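A usage sketch for word_ranking, added for illustration: it assumes the module's own imports (a sentence tokenizer, pattern's Document/Model/TFIDF, numpy and collections.Counter) are in place; the input text and the top=5 cut-off are invented.
text = ("The cat purrs. Curiosity killed the cat. "
        "The dog wags his tail. The dog is happy.")
ranking = word_ranking(text)            # Counter mapping term -> cross-method score
for term, score in ranking.most_common(5):
    print(term, score)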
Example no. 3
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(story['summary']) as a unique id to avoid duplicate
            # content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=[
                 'news', 'day'], name=date))

    for document in m:
        print document.name
        print document.keywords(top=10)
Example no. 4
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=[
                         'news', 'day'], name=date))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
            pass
Example no. 5
def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # create the list of documents
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))

    # reordering
    output = [sentences[i] for i in sorted(top_sents_idx)]

    return ''.join(output)
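Similarly, a small invented example of calling summarize(), assuming the module's tokenize and utils.textrank helpers are available as the function expects:
text = ("The cat purrs. Curiosity killed the cat. "
        "The dog wags his tail. The dog is happy.")
print(summarize(text, n=2))   # the two top-ranked sentences, in their original order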
Example no. 6
def build_model(results=None):
    # avoid a mutable default argument
    results = results or []
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
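A sketch of how build_model() might be driven, with invented data: the 'text', 'url' and 'index' keys mirror what the function reads from each result dict, and the module's other dependencies (numpy, a sent_tokenize function, pattern's words/LEMMA and the stopwords_hash table) are assumed to exist as in the original project.
results = [
    {'url': 'http://example.com/a', 'index': 1,
     'text': 'Tigers are big yellow cats with stripes that live in the forests of Asia.'},
    {'url': 'http://example.com/b', 'index': 2,
     'text': 'Lions are big yellow cats with manes that live on the plains of Africa.'},
]
matrix, m, model_sentences, sentence_dict = build_model(results)
# matrix: one binary row per kept sentence, one column per model feature
# sentence_dict: matrix row index -> source document name (the url)
print(matrix.shape)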
Example no. 7
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([
                word for word in text.words if word not in cachedStopWords
            ]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
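    # NOTE: the early return above makes everything below unreachable; it is kept for reference.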
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show only the first 5 strong features we come across
            for feature, weight in m.lsa.concepts[i].items():
                if abs(weight) > 0.2:
                    print(feature),
                    w.write(feature + " ")
                    count += 1

                if count > 5:
                    break
            w.write(unicode('\n'))
            #print

            cat_docs = []
            for d in m.documents:
                cat = (0, 0, {})
                #print d.name.split('\\')[-1]
                for idx, weight in m.lsa.vectors[d.id].items():
                    print "\tCat {0}: {1}".format(idx, weight)
                    if abs(weight) > abs(cat[1]) or cat[1] == 0:
                        cat = (idx, weight, d)

                if cat[0] == i:
                    cat_docs.append(cat)
                    #print "\t{0}".format(d.name.split('\\')[-1])

            cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
            for cat, weight, d in cat_docs:
                f = d.name.split('\\')[-1]
                w.write(
                    unicode("\t{0} - {1}\n").format(
                        filter(lambda x: x in string.printable, f), weight))
Example no. 8
    def rankingSVM(self, listaUrls, consulta, parametros):
        """ metodo para rankear una lista de urls mediante el algoritmo RSVM
            Entrada:
                listaUrls: lista de los urls para rankear
                consulta: consulta de busqueda en cadena de caracteres
                parametros: parametros
            Salida:
                lista de urls rankeados
        """

        self.preprocesamiento.lecturaSVMRanking(listaUrls, consulta)
        """ creacion de atributos para cada enlace"""
        listaUrls = self.setearAtributosRanking(listaUrls, consulta)
        """se obtiene los puntos para realizar el ranking"""
        puntos = self.getAtributosRanking(listaUrls, consulta.name)
        X = np.array(puntos['X'])

        svmNorelevante = joblib.load('Model/SVM/norelevante.pkl')
        svmRelevante = joblib.load('Model/SVM/relevante.pkl')
        svmMuyrelevante = joblib.load('Model/SVM/muyrelevante.pkl')

        prediccionesNoRelevante = svmNorelevante.predict(X)
        prediccionesRelevante = svmRelevante.predict(X)
        prediccionesMuyRelevante = svmMuyrelevante.predict(X)

        listaUrls = self.preprocesamiento.limpiarListaUrls(
            listaUrls, puntos['name'])
        ranking = []

        modeloLista = []
        for url in listaUrls:
            documento = self.mongodb.getDocumento(url)
            if documento:
                documentoPattern = self.preprocesamiento.getDocumentoPattern(
                    documento['_id'])
                modeloLista.append(documentoPattern)

        unModelo = Model(modeloLista)
        """calculo del puntaje de ranking SVM"""
        for indice, doc in enumerate(unModelo):
            url = doc.name
            documento = {}
            documento['url'] = url
            documento['score'] = (
                1 - self.obtenerVectorSpaceModel(doc, consulta)) + (
                    prediccionesNoRelevante[indice] +
                    prediccionesRelevante[indice] * parametros[1] +
                    prediccionesMuyRelevante[indice] * parametros[2])
            ranking.append(documento)

        listaNueva = sorted(ranking, key=lambda k: k['score'], reverse=True)
        return listaNueva
Example no. 9
def extract():
    print 'Extracting features from app descriptions...\n'
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for dir in os.listdir(INPUT_PATH):
        if not dir.startswith('.'):
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    next(reader)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file),
                              'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            name = app[0]
                            description = app[2]

                            # Prepare an app description string for NLTK and LDA processing
                            preparedDescription = prepare_description(
                                description)

                            # Extract 3 word featurlets from the description
                            featurelets = featurelet_extraction(
                                preparedDescription)

                            list = []
                            for feature in featurelets:
                                featurelet = '{} {} {}'.format(
                                    feature[0], feature[1], feature[2])
                                list.append(
                                    Document(featurelet, name=featurelet))

                            # Perform hierarchical clustering
                            m = Model(list)
                            cluster = m.cluster(method=HIERARCHICAL,
                                                k=3,
                                                iterations=1000,
                                                distance=COSINE)

                            # Organize clusters into features and alternative tokens
                            (features,
                             alterTokens) = group(cluster, [], [], [])

                            # Write results to file
                            writer.writerow(
                                [name, description, features, alterTokens])
                        r.close()
                    f.close()
Example no. 10
def get_model_from_documents(path='./*/*.txt'):
    '''return model from given txt files'''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    files = glob.glob(path)
    for filename in files:
        with codecs.open(filename, 'r') as f:
            data = f.read()
        document = Document(data)
        documents.append(document)

    model = Model(documents=documents, weight=TFIDF)
    return documents, model
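A one-line usage sketch (the glob pattern is illustrative and should point at your own text files):
documents, model = get_model_from_documents('./corpus/*.txt')
print(len(model.documents))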
Example no. 11
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m
Example no. 12
def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
                print repr(e)
                print r2['_id']
        except KeyError as e:
            print repr(e)
    return Model(r2_list)
Example no. 13
    def calculate(self, minePackage):
        webDocuments = []
        query = Document(minePackage['searchKey'])
        clouds = minePackage['clouds']
        count = UnPack()
        totalLinks = count.total(clouds)
        urlContent = UrlToPlainText()
        step = 0
        for cloud in clouds:
            for n in cloud.graph.nodes():
                doc = cloud.graph.node[n]['methodData']
                webDocuments.append(Document(doc.getData()))
                step += 1
        m = Model(documents=webDocuments, weight=TFIDF)
        for cloud in clouds:
            for n in cloud.graph.nodes():
                methodData = cloud.graph.node[n]['methodData']
                vector = Document(methodData.getData())
                # store the VSM similarity to the query as the node's weight
                cloud.graph.node[n]['weight_VSM'] = m.similarity(vector, query)
Example no. 14
    def rankingVectorSpaceModel(self, listaUrls, consulta):
        """metodo para el ranking mediante VSM
        Entrada: Consulta de busqueda en string, y lista de urls
        Salida: lista final rankeado"""
        listaUrlsRankeados = []
        listaModel = []
        for url in listaUrls:
            documento = self.mongodb.getDocumento(url)
            if documento:
                documentoPattern = self.preprocesamiento.getDocumentoPattern(
                    documento['_id'])
                listaModel.append(documentoPattern)

        unModelo = Model(listaModel, weight=TFIDF)
        for unDocumento in unModelo:
            score = self.svm.calcularVectorSpaceModel(consulta, unDocumento)
            listaUrlsRankeados.append(
                self.crearJsonRanking(unDocumento.name, score))

        listaFinal = sorted(listaUrlsRankeados,
                            key=lambda k: k['score'],
                            reverse=False)

        return listaFinal
Example no. 15
def main():

    ##############################################################################################
    print('QUESTION 1, Part I: Web Crawling: Extraction of Book Titles')
    print("-" * 70)
    print('\n')
    print(
        'Retrieving Book Titles from the first two pages of Amazon search results! \n'
    )
    print('Please wait a minute... \n')

    print("~" * 70)

    #open the base URL webpage
    level_1_url = "https://www.amazon.com/s?url=search-alias%3Daps&field-keywords=Martin+Heidegger"

    all_titles = get_titles(level_1_url)

    #print with text wrapping
    format = '%s'

    pieces = [format % (ttl) for ttl in all_titles]
    output = ' | '.join(pieces)
    ttls = fill(output)
    print('The scraped book titles are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(ttls.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 1, Part II: Pairwise Text Cosine Similarity Scores of Book Titles'
    )
    print("-" * 70)
    print('\n')

    doc_list = []
    for i in range(len(all_titles)):
        doc_list.append(
            Document(all_titles[i], type=" ".join(all_titles[i].split())))

    m = Model(documents=doc_list, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_ttl = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_ttl.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_ttl[i][0] = float(resorted_cos_sim_ttl[i][0])
        resorted_cos_sim_ttl[i] = tuple(resorted_cos_sim_ttl[i])

    print(
        'The number of calculated book title cosine similarity scores is: {} \n'
        .format(len(resorted_cos_sim_ttl)))

    print(
        'All non-zero book title cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_ttl):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most similar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most dissimilar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 1, Part III: Most Similar and Dissimilar Book Titles and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of book titles is: \n')
    print(max(resorted_cos_sim_ttl))
    print('\n')

    print('The most dissimilar pair of book titles is: \n')
    print(min(resorted_cos_sim_ttl))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[2]))
    print('\n')

    #############################################################################################
    print('QUESTION 2, Part I: Web Crawling: Extraction of Search Capsules')
    print("-" * 70)
    print('\n')

    orig_query = 'Ponderings XII–XV: Black Notebooks 1939–1941 (Studies in Continental Thought)'

    level_1_url = "https://www.google.com/search?q=" + orig_query.replace(
        ' ', '+')

    all_capsules = get_capsules(level_1_url)

    all_capsules_clean = []
    for cp in all_capsules:
        all_capsules_clean.append(
            unicodedata.normalize('NFKD', cp).encode('ascii',
                                                     'ignore').decode('utf-8'))

    #print with text wrapping
    format = '%s'

    pieces = [format % (cap) for cap in all_capsules_clean]
    output = ' | '.join(pieces)
    caps = fill(output)
    print('The scraped capsules are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(caps.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 2, Part II: Pairwise Text Cosine Similarity Scores of Search Capsules'
    )
    print("-" * 70)
    print('\n')

    query_list = []
    for i in range(len(all_capsules_clean)):
        query_list.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=query_list, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_caps = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_caps.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_caps[i][0] = float(resorted_cos_sim_caps[i][0])
        resorted_cos_sim_caps[i] = tuple(resorted_cos_sim_caps[i])

    print(
        'The number of calculated capsule cosine similarity scores is: {} \n'.
        format(len(resorted_cos_sim_caps)))

    print(
        'All non-zero capsule cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_caps):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most similar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most dissimilar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    print(
        'Finding the capsule with the highest cosine similarity to the original query... \n'
    )
    all_capsules_clean.append(orig_query)

    caps_and_query = []
    for i in range(len(all_capsules_clean)):
        caps_and_query.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=caps_and_query, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim_query = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_query = []
    for i in range(len(unique_cos_sim_query)):
        resorted_cos_sim_query.append(
            sorted(tuple(str(e) for e in unique_cos_sim_query[i])))
        resorted_cos_sim_query[i][0] = float(resorted_cos_sim_query[i][0])
        resorted_cos_sim_query[i] = tuple(resorted_cos_sim_query[i])

    result_list = []
    for tup in resorted_cos_sim_query:
        if orig_query in tup:
            result_list.append(tup)

    result_tup = max(result_list, key=lambda x: x[0])
    print(
        'The cosine similarity score of the capsule most similar to the original query is: \n'
    )
    print(result_tup)
    print('\n')

    print(
        'Finding search ranking of the capsule with the highest cosine similarity to the original query... \n'
    )

    match_list = []
    for item in all_capsules_clean:
        match_list.append(item.replace('\n', ''))

    print(
        'The search ranking of the capsule most similar to the original query is: \n'
    )
    print(match_list.index(result_tup[1]))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 2, Part III: Most Similar and Dissimilar Capsules and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of capsules is: \n')
    print(max(resorted_cos_sim_caps))
    print('\n')

    print('The most dissimilar pair of capsules is: \n')
    print(min(resorted_cos_sim_caps))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[2]))
    print('\n')

    ############################################################################################

    print('Summary Report: Document Similarity Semantic Analysis')
    print("-" * 70)
    ################
    report = "A crawler with changing user-agent headers was used to scrape book titles on Amazon from the first two pages of results returned when searching the philosopher, Martin Heidegger. Using TF-IDF values derived from a model incorporating the scraped results, all pairwise cosine similarity scores were calculated for the corpus documents, each of which consisted of the book title and any accompanying subtitle text. The scores were filtered for unique book title pairs and sorted by ascending cosine similarity score, so the top 5 and bottom 5 pairs could be printed in terminal. As several pairings returned a cosine similarity score of 0, the most dissimilar pair among the lowest scores could not be decisively quantified. Interestingly, search rankings of the elements of the most similar and dissimilar pairs did not appear on the same page of results. Another crawler was used to scrape capsules returned by a Google search for one of the book titles appearing in the Amazon results. Capsules from the first three pages of Google results were Unicode normalized and decoded before they were incorporated into another model, from which TF-IDF values were derived. All pairwise cosine similarity scores were calculated for the new set of corpus documents, which consisted of all text appearing in each capsule. Scores were filtered for unique capsule pairs and sorted by ascending cosine similarity score; the top 5 and bottom 5 pairs were again printed in terminal. To identify the capsule most similar to the original query, the latter was then included in the model, from which a new set of TF-IDF values and cosine similarity scores were generated. Interestingly, the ranking of the most similar capsule appeared lower in the search results than expected, on the bottom of the second page. Intuitively, the search rankings of the capsules most similar to one another did, however, appear on the same page of Google results."
    ##############
    format = '%s'
    pieces = [format % (word) for word in report]
    output = ''.join(pieces)
    write_up = fill(output)
    print(write_up)

    return None
Example no. 16
# to represent this.

# A Model is a collection of documents vectors.
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    text = codecs.open(f, encoding="utf-8").read()
    name = os.path.basename(f)[:-4]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf(
    "food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).
# the weights will be between 0.0-1.0 (their sum is 1.0).
print d.copy()
# document vector
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 1 - distance(v1, v2)
# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document(
    'A lion is a big yellow cat with manes.',
    type='lion',
)
d3 = Document('An elephant is a big grey animal with a slurf.',
              type='elephant')
print d1.vector
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant
# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')
m = Model([d1, d2, d3, d4])
m.reduce(2)
for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print (feature, w1 * w2)
Example no. 18
# -*- coding: utf-8 -*-

from json import load
from pattern.vector import Document, Model, L2

packages = load(file("packages.json"))

docs = [Document(p['description'], name=p['name']) for p in packages]
model = Model(docs)

lsa = model.reduce(L2)
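A possible follow-up, sketched here rather than taken from the source: the reduced concept space can be inspected through model.lsa.concepts, a sequence of {feature: weight} mappings (the same structure the other LSA examples in this collection iterate over); the 0.2 threshold and the top-5 cut are arbitrary.
for i, concept in enumerate(model.lsa.concepts):
    strongest = [f for f, w in concept.items() if abs(w) > 0.2]
    print(i, strongest[:5])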
Example no. 19
r2_list = []
for query in r2_queries:
    for r2 in r2_exhibits.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            projects = [r2['projects'][k] for k in r2['projects'].keys()]
            for proj in projects:
                try:
                    strings.append(proj['mission_desc'])
                except KeyError as e:
                    pass
            doc = Document(" ".join(strings), name=r2['_id'])
            r2_list.append(doc)
        except KeyError as e:
            print repr(e) # not much to do about this
m = Model(r2_list)

def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
                print repr(e)
                print r2['_id']
        except KeyError as e:
            print repr(e)
    return Model(r2_list)
Example no. 20
from pattern.vector import Document, Model

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(2)
 
for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print (feature, w1 * w2)
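A possible extension, not in the original example: assuming similarity is computed in the reduced concept space after reduce(), Model.neighbors returns (similarity, Document) pairs, the same structure Example no. 24 unpacks.
for similarity, doc in m.neighbors(d1, top=3):
    # d1 itself may appear first; the other cat document should rank above the dog ones
    print(doc.name, similarity)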
Example no. 21
def get_results(query, quantity, force=False, news=False, analysis=True):
    query = query.lower()
    start = datetime.now()

    query = query.replace('_', '%20')
    breakdown = 50

    if breakdown > quantity:
        breakdown = quantity

    data_to_be_written = []
    knowledgeKeywords = []
    duplicates = []

    results, created = webSearch.objects.get_or_create(queryText=query.strip())
    if created or force or len(results.results.all()) < quantity:
        all_results = getGoogleResults(query, quantity, news, force)
    else:
        all_results = []

    if len(all_results) == 0 and not created:
        all_results = [r.url for r in results.results.all()]

    all_results = all_results[:quantity]
    print "TOTAL RESULTS ", str(len(all_results))
    # Done with getting search results

    for index, i in enumerate(all_results):
        try:
            wr, created = WebResource.objects.get_or_create(url=i)
            if created:
                wr = parseURL(i, True)
            data = {'url': i}
            keywords = [
                w for w in count(wr.text, top=10, stemmer=LEMMA)
                if w not in stop
            ]

            if 'books.google' in i:
                text = ''
            else:
                text = wr.text

            data.update({
                'keywords': keywords,
                'text': plaintext(text),
                'title': wr.title,
                'urls': wr.urls,
                'type': 'result',
                'index': index + 1,
                'similar': [],
                'duplicates': [],
                'category': 0,
            })

            if wr not in results.results.all():
                results.results.add(wr)

            data['plaintext'] = data['text'].split('\n')

            # while '' in data['plaintext']:
            # 	data['plaintext'].remove('')

            # knowledgeKeywords.extend(data['keywords'])

            data_to_be_written.append(data)
        except Exception as e:
            print e

    print "Response Result model Prepared"

    if not analysis:
        return data_to_be_written

    list_of_sim_docs, model, m = find_similarity(data_to_be_written)
    for i in list_of_sim_docs:
        similar = {
            'type': 'similar',
            's': i.get('source'),
            'd': i.get('dest'),
            'source': i.get('source'),
            'dest': i.get('dest'),
            'score': i.get('score'),
        }
        data_to_be_written.append(similar)

        if similar['score'] > 0.9:
            for res in data_to_be_written:
                if res['type'] in [
                        'result', 'duplicate'
                ] and res['url'] == i.get('dest') and len(res['text']) > 0:
                    print "Duplicate [{0}].[{1}]".format(
                        i['source'][:20], i['dest'][:20])
                    res['type'] = 'duplicate'

    items = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in data_to_be_written
    ]
    m = Model(items, weight=TFIDF)

    # k = 10
    ####### BEGIN Experimental Setup ##########

    # v,d = m.features, m.documents
    # y,x = len(m.documents),len(m.features)

    def build_matrix(w=None, d=None):
        y, x = len(d), len(w)
        model = np.zeros((y, x))

        for i in range(y):
            model[i] = [1 if w[j] in d[i].words else 0 for j in range(x)]

        return model

    # def find_word_matches(model, words = None, d = None):
    # 	y,x = model.shape
    # 	for i in range(y):
    # 		for j in range(i+1,y):
    # 			a = np.copy(model[i])
    # 			b = np.copy(model[j])

    # 			a_ones = np.count_nonzero(a)
    # 			b_ones = np.count_nonzero(b)

    # 			comparison = (a==b)

    # 			cross_product = a*b
    # 			intersection = np.count_nonzero(cross_product)
    # 			union = a_ones+b_ones-intersection

    # 			if a_ones+b_ones>0 and intersection > 0:
    # 				score = intersection/union
    # 			else:
    # 				score = 0

    # 			if model[i].any() and model[j].any() and comparison.any() and score > 0.4:
    # 				print "Match [{0}] {1}:[{2} words] - [{3}] {4}:[{5} words] : {6} words".format(d[i].description,d[i].name[:30], np.count_nonzero(a), d[j].description,d[j].name[:30], np.count_nonzero(b), score, math.fabs(d[i].description - d[j].description))
    # 				similar = {
    # 					'type' : 'similar',
    # 					'source' : d[i].name,
    # 					'dest' : d[j].name,
    # 					'score' : score,
    # 				}
    # 				data_to_be_written.append(similar)

    # 			if score >= 0.9:
    # 				for res in data_to_be_written:
    # 					if res['type'] in ['result','duplicate'] and res['url'] == d[j].name and len(res['text'])>0:
    # 						print "Duplicate [{0}].[{1}]".format(i+1,j+1)
    # 						res['type'] = 'duplicate'
    # 	return model

    def word_frequency(model,
                       words=None,
                       documents=None,
                       threshold1=0,
                       threshold2=1,
                       transpose=False):
        "Returns frequent word amoung documents in range of threshold"
        y, x = model.shape
        data = {}

        for i in range(x):
            count = np.count_nonzero(model[:, i]) / float(y)  # fraction of documents containing feature i
            if count >= threshold1 and count <= threshold2:
                if words:
                    data[words[i]] = count
                else:
                    data[i] = count
        return data

    model = build_matrix(m.features, m.documents)
    # model = find_word_matches(model, m.features, m.documents)
    knowledgeKeywords = [
        w for w in word_frequency(model, m.features, m.documents, 0.2, 0.8)
    ][:20]

    ####### END Experimental Setup ##########

    # c = m.cluster(method=HIERARCHICAL, k=k)
    # for i in c:
    # 	cluster = []
    # 	k = []
    # 	contains_text = False

    # 	for item in i:
    # 		for data in data_to_be_written:
    # 			if data.get('type') == 'result' and data.get('url')==item.name:
    # 				cluster.append({
    # 					'url' : data.get('url'),
    # 					'index' : item.description,
    # 					})
    # 				if data.get('text'):
    # 					k.extend([w for w in count(words(data.get('text')), top=50, stemmer = PORTER, exclude=[], stopwords=False, language='en')])
    # 					contains_text=True
    # 	cluster = {
    # 		'type' : 'cluster',
    # 		'data' : cluster,
    # 		'index' : min([c.get('index') for c in cluster] + [0]),
    # 		'keywords' : [w for w in count(k, top=10, stemmer = PORTER, exclude=[], stopwords=False, language='en')]
    # 	}

    # 	cluster['contains_text'] = contains_text

    # 	data_to_be_written.append(cluster)

    # print "{0} results".format(len(data_to_be_written))
    data_to_be_written.append({
        'type': 'meta',
        'keywords': knowledgeKeywords,
    })

    result = {}
    for i in data_to_be_written:
        if i.get('type') in ['result', 'duplicate']:
            url = i.get('url')
            index = int(i.get('index'))

            result[index] = [
                1 for r in data_to_be_written
                if r.get('type') == 'similar' and r['source'] == url
            ]

    result2 = [i for i, j in result.iteritems()]
    result3 = [len(j) for i, j in result.iteritems()]

    Process(target=plot_graph, args=(result2, result3)).start()

    return data_to_be_written
Example no. 22
import cPickle as pickle

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[], weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count() / 100):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
    print len(m.documents)
m.reduce(dimensions=L2)
m.save
Example no. 23
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.

# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.

m = Model()
t = Twitter()

# First, we mine a model of a 1000 tweets.
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))
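The snippet stops after collecting training material; below is a minimal sketch of the classification step it builds towards, using the KNN class imported above with its default parameters (the test string is illustrative):
classifier = KNN()
for document in m:
    classifier.train(document)      # each Document carries its 'WIN' / 'FAIL' type
print(classifier.classify('sweet potato burger'))   # returns 'WIN' or 'FAIL'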
Example no. 24
def recommend_game(this_game):
    games = recommendable_games(this_game)

    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ": " + total_recommendable.__str__()

    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary,
                                top=None,
                                threshold=0,
                                stemmer=None,
                                exclude=[],
                                stopwords=False,
                                language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))

    # format: {"id":id, socre:"SUM(dist*pond)"}
    game_similarities = []
    summary_documents = []
    for game in games:
        score = 0
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]

        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])
        ).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])
        ).vector)

        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + (0.2 * publisher_similarity) + (
            0.20 * keywords_similarity)

        summary_documents.append(Document(game.summary,
                                          top=None,
                                          threshold=0,
                                          stemmer=None,
                                          exclude=[],
                                          stopwords=False,
                                          language='en',
                                          name=game.id))

        game_similarities.append({"id": game.id, "score": score})

    to_compare = Document(document_summary)

    model = Model(documents=summary_documents, weight=TFIDF)

    neighbours = model.neighbors(to_compare, top=total_recommendable)

    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]

    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]

    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], 25)
    else:
        random_selection = random.sample(recommended, 25)

    recommended_ids = [g['id'] for g in random_selection]

    return recommended_ids
Example no. 25
# but it is still popular because it is fast for models
# that have many documents and many features.
# It is outperformed by KNN and SVM, but useful as a baseline for tests.

# We'll test it with a corpus of spam e-mail messages,
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
import os

from pattern.db import Datasheet
from pattern.vector import Document, Model, NB

data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora",
                    "spam-apache.csv")
data = Datasheet.load(data)

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)
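A hedged sketch of applying the trained classifier to unseen text (the messages are invented; per the comment above, True stands for actual e-mail and False for spam):
print(classifier.classify('Win a FREE phone, click here now!!!'))                      # expected: False (spam)
print(classifier.classify('The patch fixes a null pointer exception in the parser.'))  # expected: True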
    def crearModelo(self, listaDocumentos):
        '''Create a model from the list of documents using TF-IDF term weighting.'''
        return Model(listaDocumentos, weight=TFIDF)