Example 1
    def __init__(self,
                 minsize=2,
                 maxsize=4,
                 stored=False,
                 field_boost=1.0,
                 queryor=False,
                 phrase=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param field_boost: A multiplier applied to the score of matches in
            this field.
        :param queryor: If True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param phrase: Store positions on the N-grams to allow exact phrase
            searching. The default is off.
        """

        formatclass = formats.Frequency
        if phrase:
            formatclass = formats.Positions

        self.analyzer = NgramAnalyzer(minsize, maxsize)
        self.format = formatclass(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
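This appears to be the constructor of whoosh.fields.NGRAM (the signature matches), so a field of this type can be dropped straight into a schema. A minimal usage sketch under that assumption:

import os
from whoosh.fields import Schema, ID, NGRAM
from whoosh.index import create_in

# Index short text as overlapping 2- to 4-character grams; queryor=True
# makes a query match documents containing any gram instead of all of them.
schema = Schema(name=ID(stored=True),
                body=NGRAM(minsize=2, maxsize=4, queryor=True))
os.makedirs("ngram_index", exist_ok=True)  # create_in needs an existing directory
ix = create_in("ngram_index", schema)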
Example 2
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            (SimpleAnalyzer(), True),
            (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
            (StandardAnalyzer(), False),
            (StemmingAnalyzer(), False),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append((LanguageAnalyzer(lang_code), False))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append((NgramAnalyzer(4), False))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer, combine in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())
                words.update(new_words)
                # Add combined strings to allow matching multi-word
                # entries, joining up to 5 consecutive words
                if combine:
                    words.update(
                        ' '.join(new_words[x:y])
                        for x in range(len(new_words))
                        for y in range(x + 1, min(x + 6, len(new_words) + 1))
                    )

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if '' in words:
            words.remove('')

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
                '|'.join([re_escape(word) for word in words])))

        return dictionary
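The sliding-window update above is easier to see in isolation; a standalone sketch with made-up tokens:

new_words = ['quick', 'brown', 'fox']
# for each start x, join 1 to 5 consecutive tokens into one phrase
combined = {
    ' '.join(new_words[x:y])
    for x in range(len(new_words))
    for y in range(x + 1, min(x + 6, len(new_words) + 1))
}
print(sorted(combined))
# ['brown', 'brown fox', 'fox', 'quick', 'quick brown', 'quick brown fox']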
Example 3
    def get_words(self, unit):
        """
        Returns list of word pairs for an unit.
        """
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            StandardAnalyzer(),
            StemmingAnalyzer(),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append(LanguageAnalyzer(lang_code))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(
                        [token.text for token in analyzer(force_text(text))]
                    )
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())

        # Grab all words in the dictionary
        dictionary = self.filter(
            project=unit.translation.subproject.project,
            language=unit.translation.language
        )

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            query = Q()
            for word in words:
                query |= Q(source__iexact=word)

            # Filter dictionary
            dictionary = dictionary.filter(query)

        return dictionary
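The incremental Q-composition above can also be written with functools.reduce; a minimal equivalent sketch (the word list here is made up):

from functools import reduce
from django.db.models import Q

words = ['plural', 'plurals']  # hypothetical extracted words
# OR together one case-insensitive exact-match clause per word
query = reduce(lambda acc, word: acc | Q(source__iexact=word), words, Q())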
Example 4
def load_states():
    # 1- and 2-character grams, so very short query fragments can match
    analyzer = NgramAnalyzer(1, 2)
    state_schema = Schema(state=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
        print('Loading states...')
        cur.execute('SELECT DISTINCT state FROM msa')
        state_index = storage.create_index(state_schema)
        writer = state_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'])
        writer.commit()
    return state_index
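A hedged usage sketch for the index returned above (load_states and the module-level storage are assumed to come from the surrounding script):

from whoosh.qparser import QueryParser

state_index = load_states()
with state_index.searcher() as searcher:
    # 1- and 2-character grams mean even a fragment like "ca" matches
    query = QueryParser("state", state_index.schema).parse("ca")
    for hit in searcher.search(query):
        print(hit["state"])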
Example 5
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want a case-insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join(re_escape(word) for word in islice(words, 1000))),
        )
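As used in several of these examples, NgramAnalyzer(4) splits the raw text into overlapping 4-character grams, which is what makes unsegmented languages such as Chinese or Japanese searchable. A quick sketch of its output:

from whoosh.analysis import NgramAnalyzer

# with a single argument, minsize == maxsize, so only 4-grams are emitted
print([token.text for token in NgramAnalyzer(4)("search")])
# ['sear', 'earc', 'arch']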
Example 6
def exec_comp():
    '''
    Calculate the Mean Reciprocal Rank (MRR) and save a table with the MRR
    evaluation for every search-engine configuration.
    '''
    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    # labels used for the graph and for the MRR table
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', "LanguageAnalyzer('en')"
    ]

    i = 0  # counter
    mrrs = []  # MRR value for each search-engine configuration

    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(os.getcwd() + "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    # combinations of every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute the queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save the results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)  # store the MRR table
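For reference, the metric being computed is MRR = (1/|Q|) * sum over queries of 1/rank of the first relevant result. The mrr helper is not shown in this snippet, so the following is a simplified dict-based sketch with assumed input shapes, not the actual implementation:

def mrr(ground_truth, results):
    # ground_truth: query id -> set of relevant doc ids (assumed shape)
    # results: query id -> ranked list of retrieved doc ids (assumed shape)
    total = 0.0
    for query_id, relevant in ground_truth.items():
        for rank, doc_id in enumerate(results.get(query_id, []), start=1):
            if doc_id in relevant:
                total += 1.0 / rank  # reciprocal rank of first relevant hit
                break
    return total / len(ground_truth)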
Example 7
    def __init__(self, minsize=2, maxsize=4, stored=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the document. Since
            this field type generally contains a lot of text, you should avoid storing it
            with the document unless you need to, for example to allow fast excerpts in the
            search results.
        """

        self.format = Frequency(analyzer=NgramAnalyzer(minsize, maxsize))
        self.scorable = True
        self.stored = stored
Example 8
def load_cities():
    # single-character grams only, so any typed character can match a city name
    analyzer = NgramAnalyzer(1)
    city_schema = Schema(state=ID(stored=True),
                         city=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
        print('Loading cities...')
        cur.execute('SELECT DISTINCT state, city FROM msa')
        city_index = storage.create_index(city_schema)
        writer = city_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'], city=s[u'city'])
        writer.commit()
    return city_index
Example 9
def load_occs():
    # 3-character grams over occupation titles enable substring matching
    analyzer = NgramAnalyzer(3)
    occ_schema = Schema(occ_title=TEXT(stored=True, analyzer=analyzer),
                        occ_code=ID(stored=True))
    with cursor() as cur:
        print('Loading occs...')
        cur.execute('SELECT DISTINCT occ_code, occ_title FROM msa')
        occ_index = storage.create_index(occ_schema)
        writer = occ_index.writer()
        for s in cur.fetchall():
            writer.add_document(occ_title=s[u'occ_title'],
                                occ_code=s[u'occ_code'])
        writer.commit()
    return occ_index
Example 10
    def get_terms(self, unit):
        """Return list of term pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - basic simple analyzer to split on non-word chars
        # - simple analyzer that splits words based on a regexp to catch in-word dashes
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer() | stopfilter,
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError):
                    report_error(cause="Term words parsing")
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no glossary
            return self.none()

        # Build the query for fetching the words
        # We want a case-insensitive lookup
        words = islice(words, 1000)
        engine = settings.DATABASES["default"]["ENGINE"]
        if engine == "django.db.backends.postgresql":
            # Use a regex, as that utilizes the pg_trgm index
            results = self.filter(
                source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                    "|".join(re_escape(word) for word in words)
                )
            )
        else:
            # MySQL full-text search
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                )
            )

        return results.for_project(unit.translation.component.project).filter(
            language=unit.translation.language)
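The pattern built above anchors each term between whitespace or string boundaries; a quick illustration with made-up glossary terms:

import re

words = ['cat', 'dog']  # hypothetical glossary terms
pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format('|'.join(words))
print(bool(re.search(pattern, 'the cat sat', re.IGNORECASE)))  # True
print(bool(re.search(pattern, 'concatenate', re.IGNORECASE)))  # False: not a whole word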
Example 11
        #
        num_added_records_so_far += 1
        if num_added_records_so_far % 100 == 0:
            print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()  # necessary to persist the index once filled
    in_file.close()  # close the .csv file


'''
The "schemas" function above is used to create and fill all the schemas
(indexes) for both .csv files (Cranfield.csv and Time.csv).
'''

analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(), SimpleAnalyzer(),
             FancyAnalyzer(), NgramAnalyzer(4), KeywordAnalyzer(), LanguageAnalyzer('en')]  # all the analyzers that are used
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                  'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer', 'LanguageAnalyzer']  # analyzer names

csv_names = ['Cranfield', 'Time'] # file names



# iterate over all the .csv files (here just the two that exist: Cranfield.csv and Time.csv)
for name in csv_names:

    print(name, '\n\n')

    path = "C:./" + name + "_DATASET"  # path where the .csv is stored
    # iterate to create the 8 different inverted indexes, one per analyzer
    for e, type_analyzer in enumerate(analyzers):
        # the snippet is truncated here; a minimal reconstruction (assumed):
        # build a schema with this analyzer and create an index for it
        schema = Schema(id=ID(stored=True), content=TEXT(analyzer=type_analyzer))
        index = create_in(path + "/" + analyzer_names[e] + "_index", schema)
Example 12
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want a case-insensitive lookup
        words = islice(words, 1000)
        engine = settings.DATABASES["default"]["ENGINE"]
        if engine == "django.db.backends.postgresql":
            # PostgreSQL full-text search
            results = self.filter(
                source__search=reduce(lambda x, y: x | y,
                                      (SearchQuery(word) for word in words))
            )
        else:
            # MySQL
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                )
            )

        return results.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
        )
Example 13
from whoosh.fields import Schema, TEXT, STORED, NGRAMWORDS
from whoosh.index import create_in, open_dir
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer, NgramWordAnalyzer
#from whoosh.query import *
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin

analyzer = NgramAnalyzer(3)
schema = Schema(
    id=STORED,
    category=TEXT(field_boost=3.0),
    #title = TEXT(analyzer, False)
    title=NGRAMWORDS(2, 20, False, 2.0))  # minsize, maxsize, stored, field_boost

index = create_in("search", schema)
#index = open_dir("search")

writer = index.writer()
writer.add_document(id=0, title="Test Words")
writer.add_document(id=1, title="Apple Banana Cucumber")
writer.add_document(id=2, title="Deck Elevator Floor", category="test")
writer.add_document(id=3, title="Pen Pineapple Apple Pen")
writer.commit()

#parser = QueryParser("title", schema)
parser = MultifieldParser(["category", "title"], schema, {
    "category": 3.0,
    "title": 2.0
})
parser.remove_plugin_class(FieldsPlugin)

with index.searcher() as searcher:
    # the snippet ends here; a minimal completion (assumed): run one query
    results = searcher.search(parser.parse("apple"))
    for hit in results:
        print(hit)