Code example #1
    def setUp(self):
        self.first_index = hashedindex.HashedIndex()
        self.first_index.add_term_occurrence('foo', 'document2.txt')
        self.first_index.add_term_occurrence('foo', 'document1.txt')

        self.second_index = hashedindex.HashedIndex()
        self.second_index.add_term_occurrence('foo', 'document1.txt')
        self.second_index.add_term_occurrence('bar', 'document9.txt')
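
As a quick reference (not part of the original test suite), a standalone sketch of what the first fixture above produces, assuming get_documents() returns a collections.Counter as in example #3:

import collections

import hashedindex

# standalone version of the first fixture above (illustrative only)
index = hashedindex.HashedIndex()
index.add_term_occurrence('foo', 'document2.txt')
index.add_term_occurrence('foo', 'document1.txt')

# 'foo' is the only term; both documents are now known to the index
assert index.terms() == ['foo']
assert sorted(index.documents()) == ['document1.txt', 'document2.txt']

# get_documents() maps each document to the term's occurrence count within it
assert index.get_documents('foo') == collections.Counter(
    {'document1.txt': 1, 'document2.txt': 1})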
Code example #2
    def create_index(self, field_to_index: str) -> None:
        index = hashedindex.HashedIndex()
        file_name = f"{DB_ROOT}/{self.name}.json"
        data_table = read_file(file_name)
        for i, j in data_table.items():
            if field_to_index in j:
                index.add_term_occurrence(j[field_to_index], i)
Code example #3
    def test_from_dict(self):
        index2 = hashedindex.HashedIndex()
        index2.from_dict({
            'documents': {
                'a': 2,
                'b': 3
            },
            # Does not test for validity
            'terms': {
                'foo': {
                    'a': 20,
                    'b': 40
                },
                'bar': {
                    'a': 65,
                    'b': 2
                },
            }
        })

        assert unordered_list_cmp(index2.terms(), ['foo', 'bar'])
        assert unordered_list_cmp(index2.documents(), ['a', 'b'])
        assert index2.get_documents('foo') == collections.Counter({
            'a': 20,
            'b': 40
        })
        assert index2.get_documents('bar') == collections.Counter({
            'a': 65,
            'b': 2
        })
Code example #4
def ext_tittle_index(docs):
    '''
    This function returns the original posting list of title terms
    in the form {docid: {term: tf}} for all docs.
    '''
    index = hashedindex.HashedIndex()
    for (k, v) in docs.items():
        index.add_term_occurrence(k, v)
    return index
Code example #5
    def make_posting(self, docs):
        #inverted index creation

        indexlist = hashedindex.HashedIndex()
        for (i, j) in docs.items():
            for tokens in nltk.word_tokenize(j):
                if tokens not in string.punctuation:
                    indexlist.add_term_occurrence(tokens, i)
        return indexlist
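
A usage sketch (not from the original project) that builds the same kind of posting list inline; the two-document corpus is made up, and the punkt tokenizer data must be available for nltk.word_tokenize:

import string

import nltk
import hashedindex

nltk.download('punkt', quiet=True)  # tokenizer data used by nltk.word_tokenize

docs = {
    'doc1': 'the quick brown fox',
    'doc2': 'the lazy dog, the quick dog',
}

index = hashedindex.HashedIndex()
for doc_id, text in docs.items():
    for token in nltk.word_tokenize(text):
        if token not in string.punctuation:  # skip bare punctuation tokens
            index.add_term_occurrence(token, doc_id)

print(index.get_documents('the'))               # Counter({'doc2': 2, 'doc1': 1})
print(index.get_term_frequency('dog', 'doc2'))  # 2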
Code example #6
    def make_bigram_Improvement(self, docs):
        #improvement made using bigram index

        indices = hashedindex.HashedIndex()
        for (i, j) in docs.items():
            tokens = nltk.word_tokenize(j)
            for bigrams in nltk.ngrams(tokens, 2):
                indices.add_term_occurrence(bigrams, i)
        return indices
Code example #7
def create_bigram_index(docs):
    ''' Function to create a bigram inverted index to facilitate
    phrasal queries.
    '''
    index = hashedindex.HashedIndex()
    for (k, v) in docs.items():
        tokens = nltk.word_tokenize(v)
        for bigrams in nltk.ngrams(tokens, 2):
            index.add_term_occurrence(bigrams, k)
    return index
Code example #8
def create_postinglist(docs):
    ''' Function to create an inverted index to facilitate
    the vector space model for document retrieval.
    '''
    index = hashedindex.HashedIndex()
    for (k, v) in docs.items():
        for tokens in nltk.word_tokenize(v):
            if tokens not in string.punctuation:
                index.add_term_occurrence(tokens, k)
    return index
Code example #9
    def setUp(self):
        self.index = hashedindex.HashedIndex()

        for i in range(100):
            self.index.add_term_occurrence('word', 'document{}.txt'.format(i))

        for i in range(20):
            self.index.add_term_occurrence('text', 'document{}.txt'.format(i))

        self.index.add_term_occurrence('lonely', 'document2.txt')
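
A hypothetical test method (not in the original class, and assuming collections is imported as in example #3) spelling out the document frequencies this fixture implies:

    def test_document_frequencies(self):
        # 'word' was added once to each of 100 documents, 'text' to 20, 'lonely' to 1
        assert len(self.index.get_documents('word')) == 100
        assert len(self.index.get_documents('text')) == 20
        assert self.index.get_documents('lonely') == collections.Counter(
            {'document2.txt': 1})

        # 100 distinct document names were used in total
        assert len(self.index.documents()) == 100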
Code example #10
    def get_data_from_collection(self):
        if os.path.exists(self.env_dir + self.dir) and len(
                os.listdir(self.env_dir + self.dir)) > 0 and len(
                    self.collection) > 0:
            index = hashedindex.HashedIndex()
            doc_count = 0

            with io.open(self.env_dir + self.dir + self.collection,
                         'r',
                         encoding='utf8') as fp:
                for line in fp.readlines():
                    for term in textparser.word_tokenize(line,
                                                         min_length=2,
                                                         ignore_numeric=True):
                        time.sleep(1)
                        index.add_term_occurrence(
                            term, self.collection + "/line-" + str(doc_count))

                    self.docnames.append(self.collection + "/line-" +
                                         str(doc_count))

                    doc_count = doc_count + 1

            # This is a PoC to check that an incidence matrix of 1's and 0's is actually generated
            for doc in self.docnames:
                aux_doc = []
                for term in index.terms():
                    if round(index.get_term_frequency(term, doc)) > 0:
                        aux_doc.append(1)
                    else:
                        aux_doc.append(0)

                self.matrix.append(aux_doc)

            self.matrix = np.matrix(self.matrix)

            # This builds the array of terms
            for term in index.terms():
                self.terms.append(re.sub("(\(\'|\'\,\))", "", str(term)))

        else:
            print("Attempting to create '{}' into {}.".format(
                self.dir, self.env_dir))

            if not os.path.exists(self.env_dir + self.dir):
                os.mkdir(self.env_dir + self.dir, mode=0o777)
                print(
                    "The input folder, '{}', was created successfully in {}.".
                    format(self.dir, self.env_dir))

            else:
                print("The input folder, '{}', is empty in {}.".format(
                    self.dir, self.env_dir))

        return self.matrix, self.docnames, self.terms
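
A stripped-down sketch of the same incidence-matrix idea without the file handling or sleeps; the two sample lines are made up, and textparser is assumed to be hashedindex's textparser module as used above:

import numpy as np

import hashedindex
from hashedindex import textparser

lines = [
    'to be or not to be',
    'the cat sat on the mat',
]

index = hashedindex.HashedIndex()
docnames = []
for line_no, line in enumerate(lines):
    docname = 'collection/line-{}'.format(line_no)
    docnames.append(docname)
    for term in textparser.word_tokenize(line, min_length=2, ignore_numeric=True):
        index.add_term_occurrence(term, docname)

# one row per document, one column per term, 1 where the term occurs in the document
matrix = np.matrix([
    [1 if index.get_term_frequency(term, doc) > 0 else 0 for term in index.terms()]
    for doc in docnames
])
print(index.terms())
print(matrix)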
Code example #11
def make_postinglist(docs):
    '''
    This function creates a posting list of the form {term: {docid: tf}}
    for all the terms in the list of documents.
    '''
    inverted_index = hashedindex.HashedIndex()
    for (k, v) in docs.items():
        for tokens in nltk.word_tokenize(v):
            if tokens not in string.punctuation:
                inverted_index.add_term_occurrence(tokens, k)
    return inverted_index
Code example #12
    def get_data_from_input(self):
        if os.path.exists(self.env_dir + self.dir) and len(
                os.listdir(self.env_dir + self.dir)) > 0:
            self.docnames = [
                f for f in listdir(self.env_dir + self.dir)
                if isfile(join(self.env_dir + self.dir, f))
            ]
            index = hashedindex.HashedIndex()

            for doc in self.docnames:
                with io.open(self.env_dir + self.dir + doc,
                             'r',
                             encoding='utf8') as fp:
                    text = re.sub('(\t\n|\t|\n|_)', " ", fp.read())

                    for term in textparser.word_tokenize(text,
                                                         min_length=2,
                                                         ignore_numeric=True):
                        index.add_term_occurrence(term, doc)

            # This is a PoC to check that an incidence matrix of 1's and 0's is actually generated
            for doc in self.docnames:
                aux_doc = []
                for term in index.terms():
                    if round(index.get_term_frequency(term, doc)) > 0:
                        aux_doc.append(1)
                    else:
                        aux_doc.append(0)

                self.matrix.append(aux_doc)

            self.matrix = np.matrix(self.matrix)

            # This builds the array of terms
            for term in index.terms():
                self.terms.append(re.sub("(\(\'|\'\,\))", "", str(term)))

        else:
            print("Attempting to create '{}' into {}.".format(
                self.dir, self.env_dir))

            if not os.path.exists(self.env_dir + self.dir):
                os.mkdir(self.env_dir + self.dir, mode=0o777)
                print(
                    "The input folder, '{}', was created successfully in {}.".
                    format(self.dir, self.env_dir))

            else:
                print("The input folder, '{}', is empty in {}.".format(
                    self.dir, self.env_dir))

        return self.matrix, self.docnames, self.terms
Code example #13
def get_pldict(documents):
    '''
    This function gives a posting list dictionary of the form {docid: {term: tf}}
    for all documents.
    '''

    pldict = hashedindex.HashedIndex()
    for (k, v) in documents.items():
        for tokens in nltk.word_tokenize(v):
            if tokens not in string.punctuation:
                pldict.add_term_occurrence(k, tokens)

    return pldict
Code example #14
    def setUp(self):
        self.index = hashedindex.HashedIndex()

        for i in range(3):
            self.index.add_term_occurrence('word', 'document1.txt')

        for i in range(5):
            self.index.add_term_occurrence('malta', 'document1.txt')

        for i in range(4):
            self.index.add_term_occurrence('phone', 'document2.txt')

        for i in range(2):
            self.index.add_term_occurrence('word', 'document2.txt')
Code example #15
def ext_title_from(docs):
    '''
    This function creates a posting list of the form {docid: {term: tf}}
    for all the terms in the titles of all documents.
    '''
    index = hashedindex.HashedIndex()
    for (k, v) in docs.items():
        for tokens in nltk.word_tokenize(v):
            if tokens not in string.punctuation:
                index.add_term_occurrence(k, tokens)
                # title terms are given 10x their term frequency as weight to make them more important
                index[k][tokens] = 10 * index[k][tokens]
    return index
Code example #16
    def create_index(self, field_to_index: str):
        # an index already exists on field - no need to create again
        if exists_table_index_on_field(self.name, field_to_index):
            pass
        else:
            # hash-map index is created
            index = hashedindex.HashedIndex()
            db_table = read_json_file(self.name)
            for key in db_table.keys():
                if field_to_index in db_table[key].keys():
                    index.add_term_occurrence(db_table[key][field_to_index],
                                              key)

            write_to_json_file(f"{self.name}_{field_to_index}_index",
                               index.items())
Code example #17
    def test_hashedindex_constructor_with_terms(self):
        index2 = hashedindex.HashedIndex(self.index.terms())

        # Terms between the two indexes should be equal
        assert unordered_list_cmp(index2.terms(), self.index.terms())

        # No documents should be found
        assert index2.documents() == []

        # All terms should have no referenced documents
        for term in index2.terms():
            assert index2[term] == {}

        index2.add_term_occurrence('phone', 'mydoc.doc')
        assert index2.get_term_frequency('phone', 'mydoc.doc') == 1
Code example #18
File: helper.py Project: awolkenhauer/mysearch
def index_search(search_term, file_list):
    word_count = {}
    inverted_index = hashedindex.HashedIndex()
    for text in file_list:
        with open(text, 'r') as file_input:
            clean_document = re.sub('[^a-zA-Z0-9\n\s]', '',
                                    file_input.read().lower())
            file_name = text.split('/')
            for term in clean_document.split():
                inverted_index.add_term_occurrence(term, file_name[1])
        try:
            result = inverted_index.get_documents(search_term)
            for key, value in result.items():
                word_count[text] = value
        except IndexError:
            word_count[text] = 0
            continue
    return word_count
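
As the except clause above suggests, get_documents() raises IndexError for a term the index has never seen; a small helper (illustrative, not part of the original project) that folds that case into an empty result:

import collections

import hashedindex

def safe_get_documents(index, term):
    # get_documents() raises IndexError for unknown terms, so return an
    # empty Counter instead and let callers treat both cases uniformly
    try:
        return index.get_documents(term)
    except IndexError:
        return collections.Counter()

index = hashedindex.HashedIndex()
index.add_term_occurrence('foo', 'doc1.txt')
print(safe_get_documents(index, 'foo'))      # Counter({'doc1.txt': 1})
print(safe_get_documents(index, 'missing'))  # Counter()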
Code example #19
class Connection(base.DbConnection):
    index = hashedindex.HashedIndex()

    def query(self, audio, amnt_results=10):
        results = self.__perform_match(audio, amnt_results)
        return audio, results

    def add(self, audio):
        for word in self.__get_tokens(audio):
            self.index.add_term_occurrence(word, audio.filename)
        return audio

    def __perform_match(self, audio, k):
        counter = Counter()
        for word in self.__get_tokens(audio):
            try:
                counter = counter + self.index.get_documents(word)
            except IndexError:
                pass
        return self.__generate_results(counter.most_common(k))

    def __get_tokens(self, audio):
        '''
        For this dummy db only one set of tokens
        is used - the first in the tokens dictionary.
        '''
        key = list(audio.tokens.keys())[0]
        return audio.tokens[key].split(' ')

    def __generate_results(self, top_k_records):
        if len(top_k_records) == 0:
            return []
        score_base = top_k_records[0][1]
        #return counter
        return [
            MatchResult(r[0], r[1] / score_base, []) for r in top_k_records
        ]
Code example #20
def store_inverted(melody_path, inverted_path):
    with open(melody_path, "r") as f:
        all_indices = json.load(f)
    hash_index = hashedindex.HashedIndex()
    for alias in all_indices:
        indices = all_indices[alias]["index"]
        wavefile = all_indices[alias]["wavefile"]
        for index in indices:
            # f1 = ((index[0] + 0.025) // 0.05) * 0.05
            f1 = int((index[0] * 100 + 2.5) // 5 * 5)
            t1 = int(((index[1] * 100 + 5) // 10) * 10)
            f2 = int((index[2] * 100 + 2.5) // 5 * 5)
            t2 = int(((index[3] * 100 + 5) // 10) * 10)
            f3 = int((index[4] * 100 + 2.5) // 5 * 5)
            hash_index.add_term_occurrence((f1, t1, f2, t2, f3), wavefile)
    inverted = {}
    items = hash_index.items()
    for key in items:
        counters = items[key].items()
        inverted[str(key)] = {}
        for file, times in counters:
            inverted[str(key)][file] = times
    with open(inverted_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(inverted, indent=4))
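
The integer arithmetic above snaps each frequency to a 0.05 grid and each time to a 0.1 grid (both scaled by 100) before the 5-tuple becomes an index term; a minimal illustration of that rounding with made-up values:

def quantize_freq(value):
    # scale to hundredths, then round to the nearest multiple of 5
    # (equivalent to snapping the original value to a 0.05 grid)
    return int((value * 100 + 2.5) // 5 * 5)

def quantize_time(value):
    # scale to hundredths, then round to the nearest multiple of 10
    # (equivalent to snapping the original value to a 0.1 grid)
    return int((value * 100 + 5) // 10 * 10)

print(quantize_freq(0.437))  # 45  (0.437 -> 0.45)
print(quantize_time(1.234))  # 120 (1.234 -> 1.2)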
Code example #21
            hash_index.add_term_occurrence((f1, t1, f2, t2, f3), wavefile)
    inverted = {}
    items = hash_index.items()
    for key in items:
        counters = items[key].items()
        inverted[str(key)] = {}
        for file, times in counters:
            inverted[str(key)][file] = times
    with open(inverted_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(inverted, indent=4))


if __name__ == "__main__":
    # store_melody(pitches_json,melody_json)
    # store_inverted(melody_json,"./inverted.json")
    hash_index = hashedindex.HashedIndex()
    with open("./inverted.json", "r") as f:
        inverted_index = json.load(f)
    with open("./test_melody.json", "r") as f:
        test_indices = json.load(f)
    for alias in test_indices:
        indices = test_indices[alias]["index"]
        wavefile = test_indices[alias]["wavefile"]
        allcounter = {}
        skip = 0
        for index in indices:
            f1 = int((index[0] * 100 + 2.5) // 5 * 5)
            t1 = int(((index[1] * 100 + 5) // 10) * 10)
            f2 = int((index[2] * 100 + 2.5) // 5 * 5)
            t2 = int(((index[3] * 100 + 5) // 10) * 10)
            f3 = int((index[4] * 100 + 2.5) // 5 * 5)
Code example #22
    def test_repr(self):
        index = hashedindex.HashedIndex()
        assert str(index) == "<HashedIndex: 0 terms, 0 documents>"
        index.add_term_occurrence('foo', 'doc1.md')
        index.add_term_occurrence('bar', 'doc1.md')
        assert str(index) == "<HashedIndex: 2 terms, 1 documents>"
Code example #23
    def __init__(self, dim):
        self.dim = dim
        self.inverted_index = hi.HashedIndex()
        self.np_arr = None
        self.length = 0
Code example #24
File: views.py Project: athul/TapSearch
def search(request):
    submitbutton = request.POST.get("submit")
    s_query = request.POST.get("q")
    
    form = searchForm(request.POST or None)
    if form.is_valid():
        #form.save()
        s_query = form.cleaned_data.get('query')
    query=request.POST.get('q')
    print(query)
    #code=index(request)
    
    indexer = hashedindex.HashedIndex()
    string_ip = """So since the train was Late. It was stopped in Erode and Held there for an Hour from 4 to 5 AM. We were to reach Bangalore by 8 in the Morning but yet here we are 200 Km from Bangalore. But yet it had its perks. I saw the Eastern Ghats for the First time. The Villages the train Passed in Tamil Nadu. Not eating Breakfast till Noon etc… So anyways we reached Bangalore by 11 AM in the Morning. We were planning to go First to Christ University Campus since Red Hat’s DevConf was being held. It was also an Oppurtunity to meet some fellow Campus Experts. So we got on a Bus from KSR to Christ University Campus. But due to our misinformation that bus was to Another Christ University Campus away from the City. Damn There were 3 Christ Campuses in bangalore Itself. So we got off somewhere and took a Cab to the venue of ETHIndia. We reached the Venue of ETHIndia by 12PM. We found one of our frined Sreeram there Vlogging with his Gimbal Camera.\n\n The Food was free and that was so delicious. It was a new cuisine which I had never tasted before. So we ate our Breakfast/Lunch and went to a conference hall. We got in when the guys from Lendroid were talking about their platform. Time passed and I found these things talks quite Boring. I got out of the room and met Anoop who goes by Kai as his handle. He was also from Kerala. Then Subin and Varun came. Then I met Allen and his team mate Ushana. In the evening Sreeram was interviewing Eylon from Do Stack and I met him and we listened to his story on how he got into Blockchain and stuff. Then there was a talk from Vitalik, the founder of Ethereum.
\n\n The hackathon actually started in the Evening by 7 PM. All of us were given a Bag with 2 Notebooks, Some Stickers and 4 Tshirts. Then I met Syamettan from Mozilla Kerala.He was the maintainer of Keralarescue.in . He was there for some venue arrangements. Then I also met two volunteers for the Hackathon who were friends of Syamettan. We started hacking on something dealing with IPFS and Decentralization. I made the frontend for the stuff. Time passed by. Then I actually met the person sitting infront of me who was also a Malayali. I was thrilled when I heard he was from Pala, Kottayam. At least someone who understands my slang(Kottayam’s Malayalam is the best form of malayalam. The other guys were not from Kottayam though, so it was hard for them to understand some words I was saying). A midnight we went just for a walk with Syamettan. We wanted a coffee but no shops were open. we came back. Oh I didnt tell you about the unlimited supply of Monster Energy Drink Right? Yep unlimited."""
    # To Lower and Indexing Function
    def replacer(s):
        s = s.replace(".", "")  # removed periods
        s = s.replace(",", "")  # removed commas
        s = s.replace("[", "")
        s = s.replace("]", "")
        s = s.replace("(", "")
        s = s.replace(")", "")
        s=s.replace("'","")
        return s
    def toLower(s):
        s=replacer(s)
        string_low = s.lower()  # convert to lowercase
        spl = string_low.split("\n\n")  # split with para
        uid_list = []
        for i in range(len(spl)):
            uid_list.append(i+1)
        return spl, len(spl), uid_list


    st_arr, length, uid = toLower(string_ip)

    # Converting List to String
    def toString(s):
        sep = " "  # avoid shadowing the built-in str
        return sep.join(s)


    sparr = toString(st_arr)
    sparr_split = sparr.split(" ")


    for i in range(length):
        for j in range(len(sparr_split)):
            indexer.add_term_occurrence(sparr_split[j], uid[i])


    def doc_splitter(query):
        for j in range(len(uid)):
            for i in range(len(st_arr)):
                sparr = toString(st_arr[i])
                sparr_split = sparr.split(" ")
                indexer.add_term_occurrence(sparr_split[i], uid[j])
        indexes=indexer.items()
        if query in indexes:
            print(indexes.get(query))
            return indexes.get(query)

    index_fin=doc_splitter(query)
    context = {"form": form,"index":index_fin}
    return render(request, "search/search.html", context)
Code example #25
    def test_merge_index_empty(self):
        assert hashedindex.merge([]) == hashedindex.HashedIndex()
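
merge() is only exercised with an empty list here; a sketch of merging two populated indexes, assuming merge() combines the term occurrences of its inputs (which this test does not itself demonstrate):

import hashedindex

first = hashedindex.HashedIndex()
first.add_term_occurrence('foo', 'document1.txt')

second = hashedindex.HashedIndex()
second.add_term_occurrence('bar', 'document2.txt')

# assumed behaviour: the merged index holds the occurrences from both inputs
merged = hashedindex.merge([first, second])
print(merged.terms())      # expected to contain 'foo' and 'bar'
print(merged.documents())  # expected to contain both documents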
Code example #26
    def test_integrity(self):
        index2 = hashedindex.HashedIndex()
        index2.from_dict(self.index.to_dict())

        assert index2 == self.index