def process_Query_Type(self, tokens):
    # Keyword lists such as views_boosters, artist_name_boosters, writer_name_boosters,
    # music_boosters, movie_boosters and genre_boosters are defined elsewhere in the module.
    boost_params = []
    boosts = {"title_sinhala": 1, "artist_name": 1, "writer_name": 1,
              "music": 1, "movie": 1, "genre": 1, "lyrics": 1}
    additional_tokens = []
    for token in tokens:
        # Split each token into its base form and affix.
        splits = word_splitter.split(token)
        additional_tokens.append(splits['base'])

        if token in views_boosters or splits['affix'] in views_boosters or splits['base'] in views_boosters:
            boost_params.append("view")
        if token in artist_name_boosters or splits['affix'] in artist_name_boosters or splits['base'] in artist_name_boosters:
            boost_params.append("artist_name")
            boosts['artist_name'] = 2
        if token in writer_name_boosters or splits['affix'] in writer_name_boosters or splits['base'] in writer_name_boosters:
            boost_params.append("writer_name")
            boosts['writer_name'] = 2
        if token in music_boosters or splits['affix'] in music_boosters or splits['base'] in music_boosters:
            boost_params.append("music")
            boosts['music'] = 2
        if token in movie_boosters or splits['affix'] in movie_boosters or splits['base'] in movie_boosters:
            boost_params.append("movie")
            boosts['movie'] = 2
        if token in genre_boosters or splits['affix'] in genre_boosters or splits['base'] in genre_boosters:
            boost_params.append("genre")
            boosts['genre'] = 2

    # Extend the original query with the extracted base forms.
    query_mod = " ".join(tokens + additional_tokens)
    return set(boost_params), boosts, query_mod
def predictQType(self, tokens):
    boost_params = []
    boosts = {"title_si": 1, "artist": 1, "writer": 1, "music": 1, "genre": 1.0, "lyrics": 1}
    additional_tokens = []
    print(tokens)
    for token in tokens:
        # Split the token to identify its affix.
        splits = word_splitter.split(token)
        additional_tokens.append(splits['base'])
        # Add the base form to the query depending on the threshold.
        if token in rating_boosters or splits['affix'] in rating_boosters or splits['base'] in rating_boosters:
            boost_params.append("rate")
        if token in artist_boosters or splits['affix'] in artist_boosters or splits['base'] in artist_boosters:
            boost_params.append("artist")
            boosts['artist'] = 2
        if token in writer_boosters or splits['affix'] in writer_boosters or splits['base'] in writer_boosters:
            boost_params.append("writer")
            boosts['writer'] = 2
        if token in music_boosters or splits['affix'] in music_boosters or splits['base'] in music_boosters:
            boost_params.append("music")
            boosts['music'] = 2
        if token in genre_boosters or splits['affix'] in genre_boosters or splits['base'] in genre_boosters:
            boost_params.append("genre")
            boosts['genre'] = 2
        # TODO: append music as well.
    query_mod = " ".join(tokens + additional_tokens)
    return set(boost_params), boosts, query_mod
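
Both methods above return a boosts dict that maps index fields to boost factors. As a minimal sketch (the helper name below is a hypothetical addition, not part of the original code), such a dict can be turned into the field^boost strings that the multi_match queries later in this listing expect:

def to_boosted_fields(boosts):
    # e.g. {"artist": 2, "lyrics": 1} -> ["artist^2", "lyrics^1"]
    return [f"{field}^{boost}" for field, boost in boosts.items()]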
Example #3
def process_word(sentence):
    raw_list = sentence.split()
    temp_list = []
    for raw_word in raw_list:
        # Keep short words as-is; only split longer words into base and affix.
        if len(raw_word) < 10:
            temp_list.append(raw_word)
            continue
        result = word_splitter.split(raw_word)
        temp_list.append(result['base'])
        temp_list.append(result['affix'])
    # Rebuild the query string from the processed pieces.
    final_query = " ".join(temp_list)
    return temp_list, final_query
Example #4
def create_orig_to_base_json(lines: list):
    # lines is a list of sentences
    import json
    import uuid
    from sinling import word_splitter as ws
    unique_file_name = str(uuid.uuid4()) + ".json"
    output_dict = dict()
    for sentence in lines:
        words = sentence.split()
        for word in words:
            # word = word.replace('\u200d', '')
            if is_sinhala_word(word):  # helper defined elsewhere in the module
                try:
                    base = ws.split(word)['base']
                    print(word, base)
                    if word not in output_dict:
                        output_dict[word] = base
                except Exception as e:
                    print("word:", word)
                    print(e, "\n")

    with open(unique_file_name, 'w', encoding='utf8') as outfile:
        json.dump(output_dict, outfile, ensure_ascii=False)
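
A minimal way to call the function above; the input file name here is only an illustrative assumption:

if __name__ == "__main__":
    with open("corpus.txt", encoding="utf8") as f:  # hypothetical input file
        create_orig_to_base_json(f.readlines())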
Example #5
    def identifyContext(self, tokens, query):
        boosting_fields = []
        boosting_data = {}
        print(tokens)
        for token in tokens:
            # Split the token into its affix and base.
            results = word_splitter.split(token)
            # Drop the possessive affix "ගේ" and the word "ගීත" ("songs") from the query.
            if (results['affix'] == "ගේ"):
                query = query.replace(token, results['base'])
            if ("ගීත" in token):
                query = query.replace("ගීත", " ")
            #print(results, token)
            if (token in rating_identifiers
                    or results['affix'] in rating_identifiers
                    or results['base'] in rating_identifiers):
                boosting_fields.append("rate")
                print(token)
                query = self.replaceUnwantedData(
                    query, [token, results['base'], results['affix']])
            if (token in artist_identifiers
                    or results['affix'] in artist_identifiers
                    or results['base'] in artist_identifiers):
                boosting_fields.append("artist")
                boosting_data['artist'] = 2.0
                query = self.replaceUnwantedData(query, [results['affix']])
            if (token in writer_identifiers
                    or results['affix'] in writer_identifiers
                    or results['base'] in writer_identifiers):
                boosting_fields.append("writer")
                boosting_data['writer'] = 2.0
                query = self.replaceUnwantedData(
                    query, [token, results['base'], results['affix']])
            if (token in genre_boosters or results['affix'] in genre_boosters
                    or results['base'] in genre_boosters):
                boosting_fields.append("genre")
                boosting_data['genre'] = 3.0
            # TODO: append music as well.

        return list(set(boosting_fields)), boosting_data, query
Example #6
def get_base(word):
    return word_splitter.split(word)['base']
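
Since every example here relies on it, note that word_splitter.split returns a dict with 'base' and 'affix' keys. A minimal check, where the sample word and its exact split are illustrative assumptions rather than documented output:

from sinling import word_splitter

result = word_splitter.split("ගායකයාගේ")  # "the singer's" (possessive form), chosen for illustration
print(result['base'], result['affix'])    # expected: the stem followed by the "ගේ" affix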
Example #7
def split():
    # Likely a Flask view; the route decorator is not shown in this excerpt.
    query_request = request.json
    print(query_request)
    results = word_splitter.split(query_request["word"])
    return results
Example #8
def query_process(query):
    # tokenizer and artist_consider are defined elsewhere in the module.
    artist_flag = False
    splited_sentence = tokenizer.tokenize(query)
    print(splited_sentence)
    for word in splited_sentence:
        if word in artist_consider:
            artist_flag = True
            break
        # Split the word once and reuse the result.
        splits = word_splitter.split(word)
        print(splits)
        affix = splits['affix']
        base = splits['base']
        print(affix)
        if affix in artist_consider:
            artist_flag = True
            # Append the base form so the stripped word can still match.
            query = query + ' ' + base
            break
    print(artist_flag)

    # Aggregation buckets on the "rate" field, shared by both query variants.
    rate_ranges = [{"from": i, "to": i + 1000} for i in range(0, 10000, 1000)]

    # Boost the artist field only when the query looks like an artist query.
    if artist_flag:
        fields = [
            "artist^3.0", "lyrics^1.0", "title^1.0", "musicArtist^1.0",
            "lyricsArtist^1.0", "genre^1.0", "movie^1.0"
        ]
    else:
        fields = [
            "artist", "lyrics", "title", "musicArtist",
            "lyricsArtist", "genre", "movie"
        ]

    body = {
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": fields
            }
        },
        "aggs": {
            "rate_range": {
                "range": {
                    "field": "rate",
                    "ranges": rate_ranges
                }
            }
        }
    }
    return body
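
A minimal sketch of sending the generated body to Elasticsearch; the client setup, the placeholder query string, and the "lyrics" index name (borrowed from the next example) are assumptions:

from elasticsearch import Elasticsearch

es = Elasticsearch()
# Hypothetical call: run the boosted or unboosted query against the "lyrics" index.
response = es.search(index="lyrics", body=query_process("සිංදු"))  # placeholder query
hits = response["hits"]["hits"]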
Example #9
def search():
      boosting_list=["name"]
      query=[]
      is_rating_query=False
      numeric_value=0
      query_request= request.form["query"].strip().lower()
      processed_query_request=""
      token_list=tokenizer.tokenize(query_request)
      
      
      # boosting_list[1] = ["artist^3" if ("ගේ" in artist) else "artist" for artist in  token_list][0]
      # boosting_list[2] = ["genere^3" if (genere in token_list) else "genere" for genere in  genere_list][0]

      if(any(x in genere_list for x in token_list)):
            boosting_list.append("genere^2")
      if(any(x in lyrics_by_list for x in token_list)):
            boosting_list.append("lyrics by^2")
      if(any(x in music_by_list for x in token_list)):
            boosting_list.append("music by^2")
      if(any(x in key_list for x in token_list)):
            boosting_list.append("key^2")
      if(any(("ගේ" in x or x in artist_list) for x in token_list)):
            boosting_list.append("artist^2")
      if(any(x in popular_list for x in token_list)):
            token_list = [i for i in token_list if i not in popular_list] 
            is_rating_query= True
      if(any(x.isnumeric() for x in token_list)):
            # Iterate over a copy: removing items from the list being iterated would skip elements.
            for x in list(token_list):
                  if (x.isnumeric()):
                        print(x)
                        token_list.remove(x)
                        numeric_value=int(x)

      # = [ if (lyrics_by in token_list) else "lyrics by"for lyrics_by in  lyrics_by_list][0]
      # boosting_list[4] = ["music by^3" if (music_by in token_list) else "music by"for music_by in  music_by_list][0]
      # boosting_list[5] = ["key^3" if (music_by in token_list) else "key"for music_by in  music_by_list][0]

      
      # affix=word_splitter.split(query_request)["affix"]

      processed_query_request =" ".join([word_splitter.split(item)["base"] if len(item)>5 else item for item in token_list]) 

      # result = es.search(index="lyrics", doc_type="doc",body={  "query": {"match" : { "genere": affix}}})

      boosting_list=list(dict.fromkeys(boosting_list))
      print(numeric_value)

      print(query_request) 
      print(token_list)
      print(processed_query_request)
      print(boosting_list)
      print(is_rating_query)
      if(len(processed_query_request)==0):
            query= {"query" : {
                  "match_all" :{}
             }
            }
      else:
            query= {"query" : {
                        "multi_match" : {
                              "query" : processed_query_request,
                              "fields" : boosting_list
                        }
                  }
            }
      # if(len(boosting_list)==0):
      #        boosting_list=["name", "artist","genere","lyrics by","music by","key","lyrics" ]  

      if(is_rating_query):
            query["sort"]=[{'views':'desc'}]
      if(numeric_value !=0):
            query["size"] = numeric_value


      boosted_query= es.search(index="lyrics", doc_type="doc",body=json.dumps(query))

      hits = boosted_query["hits"]["hits"]

      if(len(hits)==0):
            return render_template('ui.html', result = "No search result exists")
      lyrcs_list=[lyrics["_source"] for lyrics in hits ]
      return render_template('ui.html', results = lyrcs_list)
Example #10
import sys

sys.path.append("/home/basa/sinling/Sinling")

from sinling import word_splitter

filepath = "new.txt"
a = {}  # unique words collected from the input file
b = {}  # unique base forms of those words
with open(filepath, encoding="utf8") as fp:
    for line in fp:
        x = line.strip().split(' ')

        for j in x:
            a[j] = 0

for i in a:
    results = word_splitter.split(i)
    b[results['base']] = 0

# Write one base form (root) per line.
with open('roots.txt', 'w', encoding="utf8") as f:
    #print (a)
    for i in b:
        f.write(i + "\n")
        #print (results)
#word = "බළ්ලාට"
#results = word_splitter.split(word)
#print (results['base'])