Example #1
    def topics_reduction(self, desired_number):
        # nothing to do if we already have at most the desired number of topics
        if desired_number >= len(self.docs_in_topic) - 1:
            return
        else:
            print(
                f"Topics found: {len(self.docs_in_topic) - 1}. Compressing...")
            topics_to_merge = len(self.docs_in_topic) - desired_number - 1
            for _ in range(topics_to_merge):
                # cosine similarities of topics
                similarities = cos_sim(self.ctfidf, self.ctfidf).numpy()
                np.fill_diagonal(similarities, 0)

                # merge topics
                # document counts per topic, excluding the outlier topic (-1)
                topics_no_other = self.count_documents()
                del topics_no_other[-1]
                topic_to_merge = sorted(topics_no_other,
                                        key=lambda x: x[1])[0][0]  # smallest topic
                # most similar topic: index the similarity row of the topic
                # being merged (topic_to_merge, not the loop count
                # topics_to_merge); the +1/-1 offsets assume row 0 of ctfidf
                # belongs to the outlier topic -1
                topic_to_merge_into = (similarities[topic_to_merge + 1].argmax()
                                       - 1)
                self.docs_in_topic[topic_to_merge_into] = (
                    self.docs_in_topic[topic_to_merge_into] +
                    self.docs_in_topic[topic_to_merge])
                del self.docs_in_topic[topic_to_merge]
                # re-index so topic keys again run contiguously from -1 (outlier)
                self.docs_in_topic = {
                    k - 1: v
                    for k, v in enumerate(self.docs_in_topic.values())
                }

                # calculate new ctfidf
                self._words, self.ctfidf = self.calculate_cTFIDF()
            print(f"Final number of topics: {len(self.docs_in_topic) - 1}")
Example #2
    def test_simple_encode(self):
        # Encode an image:
        image_filepath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../examples/applications/image-search/two_dogs_in_snow.jpg")
        print(image_filepath)
        img_emb = self.model.encode(Image.open(image_filepath))

        # Encode text descriptions
        text_emb = self.model.encode([
            'Two dogs in the snow', 'A cat on a table',
            'A picture of London at night'
        ])

        # Compute cosine similarities
        cos_scores = util.cos_sim(img_emb, text_emb)[0]
        assert abs(cos_scores[0] - 0.3069) < 0.01
        assert abs(cos_scores[1] - 0.1010) < 0.01
        assert abs(cos_scores[2] - 0.1086) < 0.01
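
The same score vector can drive a tiny retrieval step; a short sketch that picks the best-matching caption (the caption list simply mirrors the one encoded above):

# Pick the caption whose embedding is closest to the image embedding.
captions = ['Two dogs in the snow', 'A cat on a table',
            'A picture of London at night']
best = int(cos_scores.argmax())
print("Best caption:", captions[best])  # expected: 'Two dogs in the snow'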
Example #3
document = """
New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.

Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York is home to the highest number of billionaires of any city in the world.

New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.

Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
"""

#Split the document into sentences
sentences = nltk.sent_tokenize(document)
print("Num sentences:", len(sentences))

#Compute the sentence embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

#Compute the pair-wise cosine similarities
cos_scores = util.cos_sim(embeddings, embeddings).numpy()

#Compute the centrality for each sentence
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

#We argsort so that the first element is the sentence with the highest score
most_central_sentence_indices = np.argsort(-centrality_scores)

#Print the 5 sentences with the highest scores
print("\n\nSummary:")
for idx in most_central_sentence_indices[0:5]:
    print(sentences[idx].strip())
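
If the LexRank helper is not at hand, a minimal stand-in for degree_centrality_scores can be written by summing each sentence's similarities to all other sentences (plain degree centrality; the real helper implements the LexRank variant, so this is only an approximation):

import numpy as np

def degree_centrality_scores_sketch(similarity_matrix, threshold=None):
    # Optionally zero out weak edges, mimicking LexRank's thresholding.
    sim = np.array(similarity_matrix, dtype=float)
    if threshold is not None:
        sim = np.where(sim >= threshold, sim, 0.0)
    np.fill_diagonal(sim, 0.0)   # ignore self-similarity
    return sim.sum(axis=1)       # degree of each sentence in the graph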
Example #4
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
from sentence_transformers import SentenceTransformer, models, util

#Raw Hugging Face CLIP: the processor prepares text and image tensors
#(note the keyword argument is `text`, not `texts`)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open('two_dogs_in_snow.jpg')
inputs = processor(text=["a cat", "a dog"], images=[image],
                   return_tensors="pt", padding=True)
output = model(**inputs)
#The image embedding can also be computed step by step:
#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])


#Load CLIP model through sentence-transformers, save it, and reload it
clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])

model.save('tmp-clip-model')

model = SentenceTransformer('tmp-clip-model')

#Encode an image:
img_emb = model.encode(Image.open('two_dogs_in_snow.jpg'))

#Encode text descriptions
text_emb = model.encode(['Two dogs in the snow', 'A cat on a table', 'A picture of London at night'])

#Compute cosine similarities
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)
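
Continuing from the raw CLIP output above, the image-text logits can be turned into per-caption probabilities, which is the usual zero-shot classification reading of the same scores:

# logits_per_image has shape (num_images, num_texts); softmax over the text
# axis gives one probability per candidate caption for each image.
probs = output.logits_per_image.softmax(dim=-1)
print(probs)  # e.g. tensor([[p_cat, p_dog]]) for the single input image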
Example #5
import torch
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey across a field.']


# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    """
    # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))
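
For larger corpora it is often cheaper to normalize embeddings once at encode time and use a plain dot product instead of cosine similarity; both normalize_embeddings and util.dot_score are part of the library:

# With normalized embeddings, dot product equals cosine similarity.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True,
                                    normalize_embeddings=True)
query_embedding = embedder.encode(queries[0], convert_to_tensor=True,
                                  normalize_embeddings=True)
scores = util.dot_score(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(scores, k=min(5, len(corpus)))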
Example #6
logging.info("Encoding unique sentences with semantic search model: {}".format(
    semantic_model_name))

# encoding all unique sentences present in the training dataset
embeddings = semantic_search_model.encode(sentences,
                                          batch_size=batch_size,
                                          convert_to_tensor=True)

logging.info("Retrieve top-{} with semantic search model: {}".format(
    top_k, semantic_model_name))

# retrieving top-k sentences given a sentence from the dataset
progress = tqdm.tqdm(unit="docs", total=len(sentences))
for idx in range(len(sentences)):
    sentence_embedding = embeddings[idx]
    cos_scores = util.cos_sim(sentence_embedding, embeddings)[0]
    cos_scores = cos_scores.cpu()
    progress.update(1)

    #We use torch.topk to find the top_k + 1 highest scores (the extra hit is the sentence itself)
    top_results = torch.topk(cos_scores, k=top_k + 1)

    for score, iid in zip(top_results[0], top_results[1]):
        #Skip self-matches and pairs already added in the reverse order
        if iid != idx and (iid, idx) not in duplicates:
            silver_data.append((sentences[idx], sentences[iid]))
            duplicates.add((idx, iid))

progress.reset()
progress.close()

logging.info("Length of silver_dataset generated: {}".format(len(silver_data)))