Example #1
0
 def run(self):
     """Worker-thread body: compute neighborhood density (or, for the
     'substitution' algorithm, mutation minimal pairs) and emit results.

     All options are read from ``self.kwargs``.  On success the results
     are emitted via ``dataReady``; failures are emitted via
     ``errorEncountered`` and a user cancellation via
     ``finishedCancelling``.
     """
     # Short startup delay (preserved from the original implementation).
     time.sleep(0.1)
     opts = self.kwargs
     self.results = []
     # Select the pronunciation-variant context class requested by the GUI.
     context = opts.pop('context')
     if context == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif context == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     attribute = opts.get('attribute', None)
     with cm(opts['corpusModel'].corpus, opts['sequence_type'],
             opts['type_token'], attribute) as c:
         try:
             if 'query' not in opts:
                 # Whole-corpus mode: results are written into a new
                 # corpus column between begin/endAddColumn.
                 end = opts['corpusModel'].beginAddColumn(attribute)
                 if opts['algorithm'] == 'substitution':
                     find_mutation_minpairs_all_words(
                         c,
                         num_cores=opts['num_cores'],
                         stop_check=opts['stop_check'],
                         call_back=opts['call_back'])
                 else:
                     neighborhood_density_all_words(
                         c,
                         algorithm=opts['algorithm'],
                         max_distance=opts['max_distance'],
                         num_cores=opts['num_cores'],
                         call_back=opts['call_back'],
                         stop_check=opts['stop_check'])
                 end = opts['corpusModel'].endAddColumn(end)
             else:
                 # Per-word mode: score each query word individually.
                 for query in opts['query']:
                     if opts['algorithm'] == 'substitution':
                         res = find_mutation_minpairs(
                             c, query,
                             stop_check=opts['stop_check'],
                             call_back=opts['call_back'])
                     else:
                         res = neighborhood_density(
                             c, query,
                             algorithm=opts['algorithm'],
                             max_distance=opts['max_distance'],
                             stop_check=opts['stop_check'],
                             call_back=opts['call_back'])
                     if opts.get('output_filename') is not None:
                         print_neighden_results(opts['output_filename'],
                                                res[1])
                     if self.stopped:
                         break
                     self.results.append([query, res[0]])
         except PCTError as e:
             self.errorEncountered.emit(e)
             return
         except Exception as e:
             # Wrap unexpected errors so the GUI can present them uniformly.
             self.errorEncountered.emit(PCTPythonError(e))
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     self.dataReady.emit(self.results)
def test_basic_corpus_mutation_minpairs(specified_test_corpus):
    """The mutation minimal-pair search for /sɑtɑ/ should report exactly
    the two known neighbours (nɑtɑ and mɑtɑ) with a count of 2."""
    query_word = Word(transcription=['s', 'ɑ', 't', 'ɑ'])
    expected_neighbours = sorted(['n.ɑ.t.ɑ', 'm.ɑ.t.ɑ'])
    with CanonicalVariantContext(specified_test_corpus,
                                 'transcription', 'type') as ctx:
        result = find_mutation_minpairs(ctx, query=query_word)
        assert result[0] == 2
        assert sorted(result[1]) == expected_neighbours
def test_basic_corpus_mutation_minpairs(specified_test_corpus):
    """find_mutation_minpairs should locate both minimal-pair neighbours
    of /sɑtɑ/ in the canonical context of the test corpus."""
    cases = [(Word(transcription=['s', 'ɑ', 't', 'ɑ']), 2)]
    with CanonicalVariantContext(specified_test_corpus,
                                 'transcription', 'type') as c:
        for word, expected_count in cases:
            result = find_mutation_minpairs(c, query=word)
            assert result[0] == expected_count
            assert sorted(result[1]) == sorted(['n.ɑ.t.ɑ', 'm.ɑ.t.ɑ'])
Example #4
0
    def run(self):
        """Worker-thread body: compute neighborhood density (or mutation
        minimal pairs) for a list of query words, or for every word in
        the corpus, then emit the results.

        All options are read from ``self.kwargs``.  Emits ``dataReady``
        on success, ``errorEncountered`` on failure and
        ``finishedCancelling`` if the user stopped the job.
        """
        kwargs = self.kwargs
        self.results = []
        # 'context' is popped (not just read) so it is not forwarded to
        # the analysis functions below.
        context = kwargs.pop('context')
        if context == RestrictedContextWidget.canonical_value:
            cm = CanonicalVariantContext
        elif context == RestrictedContextWidget.frequent_value:
            cm = MostFrequentVariantContext
        # NOTE(review): any other context value leaves `cm` unbound and the
        # `with` below raises NameError — presumably unreachable from the GUI.
        corpus = kwargs['corpusModel'].corpus
        st = kwargs['sequence_type']
        tt = kwargs['type_token']
        att = kwargs.get('attribute', None)
        ft = kwargs['frequency_cutoff']
        # Accumulates CSV rows ("word,density,neighbors") when a file
        # list was supplied; written out after the analysis finishes.
        output = list()

        with cm(corpus, st, tt, attribute=att, frequency_threshold=ft) as c:
            try:
                tierdict = defaultdict(list)
                # Create a dict with sequence_type keys for constant-time lookup
                for entry in c:
                    w = getattr(entry, kwargs['sequence_type'])
                    key = str(w)
                    tierdict[key].append(entry)
                if 'query' in kwargs:  #this will be true when searching for a single word (in the corpus or not)
                    last_value_removed = None
                    last_key_removed = None
                    for q in kwargs['query']:
                        q = ensure_query_is_word(q, c, c.sequence_type,
                                                 kwargs['tier_type'])
                        #the following code for adding/removing keys is to ensure that homophones are counted later in
                        #the ND algorithm (if the user wants to), but that words are not considered their own neighbours
                        #however, we only do this when comparing inside a corpus. when using a list of external words
                        #we don't want to do this, since it's possible for the external list to contain words that
                        #are in the corpus, and removing them gives the wrong ND value in this case
                        if kwargs['in_corpus']:
                            # Re-insert the entry removed for the previous
                            # query before removing the current one.
                            if last_value_removed:
                                tierdict[last_key_removed].append(
                                    last_value_removed)
                            w = getattr(q, kwargs['sequence_type'])
                            last_key_removed = str(w)
                            #last_value_removed = tierdict[last_key_removed].pop()
                            for i, item in enumerate(
                                    tierdict[last_key_removed]):
                                if str(item) == str(q):
                                    last_value_removed = tierdict[
                                        last_key_removed].pop(i)
                                    break

                        #now we call the actual ND algorithms
                        if kwargs['algorithm'] != 'substitution':
                            res = neighborhood_density(
                                c,
                                q,
                                tierdict,
                                algorithm=kwargs['algorithm'],
                                max_distance=kwargs['max_distance'],
                                force_quadratic=kwargs['force_quadratic'],
                                collapse_homophones=kwargs[
                                    'collapse_homophones'],
                                file_type=kwargs['file_type'],
                                tier_type=kwargs['tier_type'],
                                sequence_type=kwargs['sequence_type'],
                                stop_check=kwargs['stop_check'],
                                call_back=kwargs['call_back'])
                        else:
                            res = find_mutation_minpairs(
                                c,
                                q,
                                tier_type=kwargs['tier_type'],
                                collapse_homophones=kwargs[
                                    'collapse_homophones'],
                                stop_check=kwargs['stop_check'],
                                call_back=kwargs['call_back'])
                        # res[0] is the density/count, res[1] the neighbours.
                        if 'output_filename' in kwargs and kwargs[
                                'output_filename'] is not None:
                            print_neighden_results(kwargs['output_filename'],
                                                   res[1],
                                                   kwargs['output_format'])
                        if self.stopped:
                            break
                        if kwargs['file_list'] is not None:
                            output.append(','.join([
                                str(q),
                                str(res[0]), ','.join([str(r) for r in res[1]])
                            ]))
                        self.results.append([q, res[0]])
                else:  #this will be the case if searching the entire corpus
                    # Results are stored in a new corpus column delimited
                    # by begin/endAddColumn.
                    end = kwargs['corpusModel'].beginAddColumn(att)
                    if kwargs['algorithm'] != 'substitution':
                        results = neighborhood_density_all_words(
                            c,
                            tierdict,
                            tier_type=kwargs['tier_type'],
                            algorithm=kwargs['algorithm'],
                            output_format=kwargs['output_format'],
                            max_distance=kwargs['max_distance'],
                            num_cores=kwargs['num_cores'],
                            call_back=kwargs['call_back'],
                            stop_check=kwargs['stop_check'],
                            settable_attr=kwargs['attribute'],
                            collapse_homophones=kwargs['collapse_homophones'])
                    else:
                        results = find_mutation_minpairs_all_words(
                            c,
                            tierdict,
                            tier_type=kwargs['tier_type'],
                            collapse_homophones=kwargs['collapse_homophones'],
                            num_cores=kwargs['num_cores'],
                            stop_check=kwargs['stop_check'],
                            call_back=kwargs['call_back'])
                    end = kwargs['corpusModel'].endAddColumn(end)
                    if 'output_filename' in kwargs and kwargs[
                            'output_filename'] is not None:
                        print_all_neighden_results(kwargs['output_filename'],
                                                   results)
            except PCTError as e:
                self.errorEncountered.emit(e)
                return
            except Exception as e:
                # Wrap unexpected errors so the GUI can display them uniformly.
                e = PCTPythonError(e)
                self.errorEncountered.emit(e)
                return
        if self.stopped:
            self.finishedCancelling.emit()
            return
        # Flush the accumulated per-word CSV rows, if any were requested.
        if output and 'output_filename' in kwargs:
            with open(kwargs['output_filename'], encoding='utf-8',
                      mode='w') as outf:
                print('Word,Density,Neighbors', file=outf)
                for item in output:
                    print(item, file=outf)
        self.dataReady.emit(self.results)
Example #5
0
 def run(self):
     """Run the neighborhood-density / minimal-pair analysis off the GUI
     thread and report the outcome via Qt signals.

     Success -> ``dataReady(self.results)``; error -> ``errorEncountered``;
     user cancellation -> ``finishedCancelling``.
     """
     # Short startup delay (kept from the original implementation).
     time.sleep(0.1)
     kw = self.kwargs
     self.results = []
     chosen = kw.pop('context')
     if chosen == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif chosen == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     with cm(kw['corpusModel'].corpus, kw['sequence_type'],
             kw['type_token'], kw.get('attribute', None)) as c:
         try:
             if 'query' in kw:
                 # One result row per query word.
                 for word in kw['query']:
                     use_substitution = kw['algorithm'] == 'substitution'
                     if use_substitution:
                         res = find_mutation_minpairs(
                             c, word,
                             stop_check=kw['stop_check'],
                             call_back=kw['call_back'])
                     else:
                         res = neighborhood_density(
                             c, word,
                             algorithm=kw['algorithm'],
                             max_distance=kw['max_distance'],
                             stop_check=kw['stop_check'],
                             call_back=kw['call_back'])
                     out_name = kw.get('output_filename', None)
                     if out_name is not None:
                         print_neighden_results(out_name, res[1])
                     if self.stopped:
                         break
                     self.results.append([word, res[0]])
             else:
                 # No explicit query: analyse the whole corpus into a
                 # new column.
                 marker = kw['corpusModel'].beginAddColumn(
                     kw.get('attribute', None))
                 if kw['algorithm'] == 'substitution':
                     find_mutation_minpairs_all_words(
                         c,
                         num_cores=kw['num_cores'],
                         stop_check=kw['stop_check'],
                         call_back=kw['call_back'])
                 else:
                     neighborhood_density_all_words(
                         c,
                         algorithm=kw['algorithm'],
                         max_distance=kw['max_distance'],
                         num_cores=kw['num_cores'],
                         call_back=kw['call_back'],
                         stop_check=kw['stop_check'])
                 marker = kw['corpusModel'].endAddColumn(marker)
         except PCTError as err:
             self.errorEncountered.emit(err)
             return
         except Exception as err:
             # Normalize unexpected exceptions for the GUI error dialog.
             self.errorEncountered.emit(PCTPythonError(err))
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     self.dataReady.emit(self.results)
Example #6
0
def main():
    """Command-line interface for neighborhood density / minimal pairs.

    Loads a corpus (first trying the default PCT corpus directory under
    the user's home folder, then treating the argument as a literal
    path), wraps it in the requested pronunciation-variant context, and
    then either:

    * with ``-m``: prints the mutation minimal pairs for the query, or
    * otherwise: computes neighborhood density for a single query word,
      or for every word in a tab-separated query file (file input
      requires an output file via ``-o``).

    Raises:
        ValueError: if ``--context_type`` is not one of the documented
            options (previously this fell through silently and the raw,
            unwrapped corpus was used downstream).
    """

    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description = \
             'Phonological CorpusTools: neighborhood density CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('query', help='Word to query, or name of file including a list of words')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical', help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default= 'edit_distance', help="The algorithm used to determine distance")
    parser.add_argument('-d', '--max_distance', type=int, default = 1, help="Maximum edit distance from the queried word to consider a word a neighbor.")
    parser.add_argument('-s', '--sequence_type', default = 'transcription', help="The name of the tier on which to calculate distance")
    parser.add_argument('-w', '--count_what', default ='type', help="If 'type', count neighbors in terms of their type frequency. If 'token', count neighbors in terms of their token frequency.")
    parser.add_argument('-e', '--trans_delimiter', default='', help="If not empty string, splits the query by this str to make a transcription/spelling list for the query's Word object.")
    parser.add_argument('-m', '--find_mutation_minpairs', action='store_true', help='This flag causes the script not to calculate neighborhood density, but rather to find minimal pairs---see documentation.')
    parser.add_argument('-q', '--force_quadratic_algorithm', action='store_true', help='This flag prevents PCT from using the more efficient linear-time algorithm for edit distance of 1 neighborhoods.')
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()

    ####

    # Prefer the corpus stored in the default PCT documents folder; fall
    # back to interpreting the argument as a plain file path.
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS', args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    # Wrap the corpus in the requested pronunciation-variant context.
    # NOTE(review): these context classes are entered with `with` elsewhere
    # in the code base but are used here without entering them — confirm
    # that is intended.
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    else:
        raise ValueError('Unrecognized context type: {}'.format(args.context_type))

    if args.find_mutation_minpairs:
        query = ensure_query_is_word(args.query, corpus, args.sequence_type, args.trans_delimiter)
        matches = find_mutation_minpairs(corpus, query)
        for match in matches[1]:
            print(match)
        print('Total number of matches: {}'.format(str(matches[0])))
    else:
        try: # read query as a file name
            # Explicit utf-8 so IPA transcriptions round-trip regardless of
            # the platform's locale encoding (matches the GUI code).
            with open(args.query, encoding='utf-8') as queryfile:
                queries = [line[0] for line in csv.reader(queryfile, delimiter='\t') if len(line) > 0]
                queries = [ensure_query_is_word(q, corpus, args.sequence_type, args.trans_delimiter) for q in queries]
            results = [neighborhood_density(corpus, q, algorithm = args.algorithm, max_distance = args.max_distance,
                                            force_quadratic=args.force_quadratic_algorithm) for q in queries]
            if args.outfile:
                with open(args.outfile, 'w', encoding='utf-8') as outfile:
                    for q, r in zip(queries, results):
                        # One tab-separated line per word: word, density, neighbors.
                        outfile.write('{}\t{}'.format(q, str(r[0])) + ''.join(['\t{}'.format(str(n)) for n in r[1]]) + '\n')
            else:
                raise Exception('In order to use a file of queries as input, you must provide an output file name using the option -o.')


        except FileNotFoundError: # read query as a single word
            query = ensure_query_is_word(args.query, corpus, args.sequence_type, args.trans_delimiter)
            result = neighborhood_density(corpus, query, algorithm = args.algorithm, max_distance = args.max_distance,
                                          force_quadratic=args.force_quadratic_algorithm)

            if args.outfile:
                with open(args.outfile, 'w', encoding='utf-8') as outfile:
                    outfile.write('{}\t{}'.format(query, str(result[0])) + ''.join(['\t{}'.format(str(n)) for n in result[1]]))
            else:
                print('No output file name provided.')
                print('The neighborhood density of the given form is {}. For a list of neighbors, please provide an output file name.'.format(str(result[0])))
Example #7
0
 def run(self):
     """Worker-thread body: compute neighborhood density (or mutation
     minimal pairs) for the query words, or for the whole corpus, and
     emit the outcome via Qt signals (``dataReady`` / ``errorEncountered``
     / ``finishedCancelling``).
     """
     kwargs = self.kwargs
     self.results = []
     # 'context' is popped so it is not forwarded to the analysis calls.
     context = kwargs.pop('context')
     if context == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif context == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     # NOTE(review): any other context value leaves `cm` unbound and the
     # `with` below raises NameError — presumably unreachable from the GUI.
     corpus = kwargs['corpusModel'].corpus
     st = kwargs['sequence_type']
     tt = kwargs['type_token']
     att = kwargs.get('attribute', None)
     ft = kwargs['frequency_cutoff']
     # CSV rows ("word,density,neighbors") accumulated when a file list
     # was supplied; written out after the analysis finishes.
     output = list()
     with cm(corpus, st, tt, attribute=att, frequency_threshold=ft) as c:
         try:
             # Map tier string -> entries for constant-time lookup.
             tierdict = defaultdict(list)
             for entry in c:
                 w = getattr(entry, kwargs['sequence_type'])
                 tierdict[str(w)].append(entry)
             if 'query' in kwargs:
                 for q in kwargs['query']:
                     if kwargs['algorithm'] != 'substitution':
                         res = neighborhood_density(
                             c,
                             q,
                             tierdict,
                             algorithm=kwargs['algorithm'],
                             max_distance=kwargs['max_distance'],
                             force_quadratic=kwargs['force_quadratic'],
                             file_type=kwargs['file_type'],
                             tier_type=kwargs['tier_type'],
                             stop_check=kwargs['stop_check'],
                             call_back=kwargs['call_back'])
                     else:
                         res = find_mutation_minpairs(
                             c,
                             q,
                             tier_type=kwargs['tier_type'],
                             stop_check=kwargs['stop_check'],
                             call_back=kwargs['call_back'])
                     # res[0] is the density/count, res[1] the neighbours.
                     if 'output_filename' in kwargs and kwargs[
                             'output_filename'] is not None:
                         print_neighden_results(kwargs['output_filename'],
                                                res[1])
                     if self.stopped:
                         break
                     if kwargs['file_list'] is not None:
                         output.append(','.join([
                             q,
                             str(res[0]), ','.join([str(r) for r in res[1]])
                         ]))
                     self.results.append([q, res[0]])
             else:
                 # Whole-corpus mode: results go into a new corpus column.
                 end = kwargs['corpusModel'].beginAddColumn(att)
                 if kwargs['algorithm'] != 'substitution':
                     neighborhood_density_all_words(
                         c,
                         tierdict,
                         tier_type=kwargs['tier_type'],
                         algorithm=kwargs['algorithm'],
                         max_distance=kwargs['max_distance'],
                         num_cores=kwargs['num_cores'],
                         call_back=kwargs['call_back'],
                         stop_check=kwargs['stop_check'],
                         settable_attr=kwargs['attribute'])
                 else:
                     find_mutation_minpairs_all_words(
                         c,
                         tier_type=kwargs['tier_type'],
                         num_cores=kwargs['num_cores'],
                         stop_check=kwargs['stop_check'],
                         call_back=kwargs['call_back'])
                 end = kwargs['corpusModel'].endAddColumn(end)
         except PCTError as e:
             self.errorEncountered.emit(e)
             return
         except Exception as e:
             # Wrap unexpected errors so the GUI can display them uniformly.
             e = PCTPythonError(e)
             self.errorEncountered.emit(e)
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     # Flush the accumulated per-word CSV rows, if requested.
     if output and kwargs['file_list']:
         with open(kwargs['output_filename'], encoding='utf-8',
                   mode='w') as outf:
             print('Word,Density,Neighbors', file=outf)
             for item in output:
                 print(item, file=outf)
     self.dataReady.emit(self.results)
    def run(self):
        """Worker-thread body: compute neighborhood density (or mutation
        minimal pairs) for a list of query words, or for every word in
        the corpus, then emit the results.

        Options arrive in ``self.kwargs``; emits ``dataReady`` on
        success, ``errorEncountered`` on failure and
        ``finishedCancelling`` if the user stopped the job.
        """
        kwargs = self.kwargs
        self.results = []
        # 'context' is popped so it is not forwarded to the analysis calls.
        context = kwargs.pop('context')
        if context == RestrictedContextWidget.canonical_value:
            cm = CanonicalVariantContext
        elif context == RestrictedContextWidget.frequent_value:
            cm = MostFrequentVariantContext
        # NOTE(review): any other context value leaves `cm` unbound and the
        # `with` below raises NameError — presumably unreachable from the GUI.
        corpus = kwargs['corpusModel'].corpus
        st = kwargs['sequence_type']
        tt = kwargs['type_token']
        att = kwargs.get('attribute', None)
        ft = kwargs['frequency_cutoff']
        # CSV rows ("word,density,neighbors") collected when a file list
        # was supplied; written out at the end.
        output = list()

        with cm(corpus, st, tt, attribute=att, frequency_threshold = ft) as c:
            try:
                tierdict = defaultdict(list)
                # Create a dict with sequence_type keys for constant-time lookup
                for entry in c:
                    w = getattr(entry, kwargs['sequence_type'])
                    key = str(w)
                    tierdict[key].append(entry)
                if 'query' in kwargs:#this will be true when searching for a single word (in the corpus or not)
                    last_value_removed = None
                    last_key_removed = None
                    for q in kwargs['query']:
                        q = ensure_query_is_word(q, c, c.sequence_type, kwargs['tier_type'])
                        #the following code for adding/removing keys is to ensure that homophones are counted later in
                        #the ND algorithm (if the user wants to), but that words are not considered their own neighbours
                        #however, we only do this when comparing inside a corpus. when using a list of external words
                        #we don't want to do this, since it's possible for the external list to contain words that
                        #are in the corpus, and removing them gives the wrong ND value in this case
                        if kwargs['in_corpus']:
                            # Re-insert the entry removed for the previous
                            # query before removing the current one.
                            if last_value_removed:
                                tierdict[last_key_removed].append(last_value_removed)
                            w = getattr(q, kwargs['sequence_type'])
                            last_key_removed = str(w)
                            #last_value_removed = tierdict[last_key_removed].pop()
                            for i, item in enumerate(tierdict[last_key_removed]):
                                if str(item) == str(q):
                                    last_value_removed = tierdict[last_key_removed].pop(i)
                                    break

                        #now we call the actual ND algorithms
                        if kwargs['algorithm'] != 'substitution':
                            res = neighborhood_density(c, q, tierdict,
                                                algorithm = kwargs['algorithm'],
                                                max_distance = kwargs['max_distance'],
                                                force_quadratic=kwargs['force_quadratic'],
                                                collapse_homophones = kwargs['collapse_homophones'],
                                                file_type = kwargs['file_type'],
                                                tier_type = kwargs['tier_type'],
                                                sequence_type = kwargs['sequence_type'],
                                                stop_check = kwargs['stop_check'],
                                                call_back = kwargs['call_back'])
                        else:
                            res = find_mutation_minpairs(c, q,
                                        tier_type=kwargs['tier_type'],
                                        collapse_homophones = kwargs['collapse_homophones'],
                                        stop_check = kwargs['stop_check'],
                                        call_back = kwargs['call_back'])
                        # res[0] is the density/count, res[1] the neighbours.
                        if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                            print_neighden_results(kwargs['output_filename'], res[1], kwargs['output_format'])
                        if self.stopped:
                            break
                        if kwargs['file_list'] is not None:
                            output.append(','.join([str(q), str(res[0]), ','.join([str(r) for r in res[1]])]))
                        self.results.append([q,res[0]])
                else:#this will be the case if searching the entire corpus
                    # Results are stored in a new corpus column delimited
                    # by begin/endAddColumn.
                    end = kwargs['corpusModel'].beginAddColumn(att)
                    if kwargs['algorithm'] != 'substitution':
                        results = neighborhood_density_all_words(c, tierdict,
                                                tier_type = kwargs['tier_type'],
                                                algorithm = kwargs['algorithm'],
                                                output_format = kwargs['output_format'],
                                                max_distance = kwargs['max_distance'],
                                                num_cores = kwargs['num_cores'],
                                                call_back = kwargs['call_back'],
                                                stop_check = kwargs['stop_check'],
                                                settable_attr = kwargs['attribute'],
                                                collapse_homophones = kwargs['collapse_homophones']
                                                )
                    else:
                        results = find_mutation_minpairs_all_words(c, tierdict,
                                                tier_type = kwargs['tier_type'],
                                                collapse_homophones = kwargs['collapse_homophones'],
                                                num_cores = kwargs['num_cores'],
                                                stop_check = kwargs['stop_check'],
                                                call_back = kwargs['call_back'])
                    end = kwargs['corpusModel'].endAddColumn(end)
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_all_neighden_results(kwargs['output_filename'], results)
            except PCTError as e:
                self.errorEncountered.emit(e)
                return
            except Exception as e:
                # Wrap unexpected errors so the GUI can display them uniformly.
                e = PCTPythonError(e)
                self.errorEncountered.emit(e)
                return
        if self.stopped:
            self.finishedCancelling.emit()
            return
        # Flush the accumulated per-word CSV rows, if any were requested.
        if output and 'output_filename' in kwargs:
            with open(kwargs['output_filename'], encoding='utf-8', mode='w') as outf:
                print('Word,Density,Neighbors', file=outf)
                for item in output:
                    print(item, file=outf)
        self.dataReady.emit(self.results)